xref: /freebsd/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the OpenMPIRBuilder class and helpers used as a convenient
10 // way to create LLVM instructions for OpenMP directives.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15 #define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16 
#include "llvm/Frontend/Atomic/Atomic.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Error.h"
#include "llvm/TargetParser/Triple.h"
#include <forward_list>
#include <map>
#include <optional>
#include <tuple>
31 
32 namespace llvm {
33 class CanonicalLoopInfo;
34 struct TargetRegionEntryInfo;
35 class OffloadEntriesInfoManager;
36 class OpenMPIRBuilder;
37 class Loop;
38 class LoopAnalysis;
39 class LoopInfo;
40 
/// Move the instructions after an InsertPoint to the beginning of another
/// BasicBlock.
///
/// The instructions after \p IP are moved to the beginning of \p New which must
/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
/// \p New will be added such that there is no semantic change. Otherwise, the
/// \p IP insert block remains degenerate and it is up to the caller to insert a
/// terminator. \p DL is used as the debug location for the branch instruction
/// if one is created.
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                       bool CreateBranch, DebugLoc DL);

/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
/// insert location will stick to after the instruction before the insertion
/// point (instead of moving with the instruction the InsertPoint stores
/// internally).
LLVM_ABI void spliceBB(IRBuilder<> &Builder, BasicBlock *New,
                       bool CreateBranch);

/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
/// (missing the terminator).
///
/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
/// is true, a branch to the new successor will be created such that
/// semantically there is no change; otherwise the block of the insertion point
/// remains degenerate and it is the caller's responsibility to insert a
/// terminator. \p DL is used as the debug location for the branch instruction
/// if one is created. Returns the new successor block.
LLVM_ABI BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                             DebugLoc DL, llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator).  Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
LLVM_ABI BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
                             llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator).  Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
LLVM_ABI BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
                             llvm::Twine Name);

/// Like splitBB, but reuses the current block's name for the new name.
LLVM_ABI BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder,
                                       bool CreateBranch,
                                       llvm::Twine Suffix = ".split");
91 
/// Captures attributes that affect generating LLVM-IR using the
/// OpenMPIRBuilder and related classes. Note that not all attributes are
/// required for all classes or functions. In some use cases the configuration
/// is not necessary at all, because the only functions that are called
/// are ones that are not dependent on the configuration.
class OpenMPIRBuilderConfig {
public:
  /// Flag to define whether to generate code for the role of the OpenMP host
  /// (if set to false) or device (if set to true) in an offloading context. It
  /// is set when the -fopenmp-is-target-device compiler frontend option is
  /// specified.
  std::optional<bool> IsTargetDevice;

  /// Flag for specifying if the compilation is done for an accelerator. It is
  /// set according to the architecture of the target triple and currently only
  /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
  /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
  /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
  /// like target with the ability to work as the OpenMP host is added, or if
  /// the capabilities of the currently supported GPU architectures are
  /// expanded.
  std::optional<bool> IsGPU;

  /// Flag for specifying if LLVMUsed information should be emitted.
  std::optional<bool> EmitLLVMUsedMetaInfo;

  /// Flag for specifying if offloading is mandatory.
  std::optional<bool> OpenMPOffloadMandatory;

  /// First separator used between the initial two parts of a name.
  std::optional<StringRef> FirstSeparator;
  /// Separator used between all of the rest consecutive parts of a name.
  std::optional<StringRef> Separator;

  /// Grid Value for the GPU target.
  std::optional<omp::GV> GridValue;

  /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
  /// false`), this contains the list of offloading triples associated, if any.
  SmallVector<Triple> TargetTriples;

  LLVM_ABI OpenMPIRBuilderConfig();
  LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                                 bool OpenMPOffloadMandatory,
                                 bool HasRequiresReverseOffload,
                                 bool HasRequiresUnifiedAddress,
                                 bool HasRequiresUnifiedSharedMemory,
                                 bool HasRequiresDynamicAllocators);

  // Getter functions that assert if the required values are not present.
  bool isTargetDevice() const {
    assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
    return *IsTargetDevice;
  }

  bool isGPU() const {
    assert(IsGPU.has_value() && "IsGPU is not set");
    return *IsGPU;
  }

  bool openMPOffloadMandatory() const {
    assert(OpenMPOffloadMandatory.has_value() &&
           "OpenMPOffloadMandatory is not set");
    return *OpenMPOffloadMandatory;
  }

  omp::GV getGridValue() const {
    assert(GridValue.has_value() && "GridValue is not set");
    return *GridValue;
  }

  /// Returns true if any 'requires' directive clause flag is set.
  bool hasRequiresFlags() const { return RequiresFlags; }
  LLVM_ABI bool hasRequiresReverseOffload() const;
  LLVM_ABI bool hasRequiresUnifiedAddress() const;
  LLVM_ABI bool hasRequiresUnifiedSharedMemory() const;
  LLVM_ABI bool hasRequiresDynamicAllocators() const;

  /// Returns requires directive clauses as flags compatible with those expected
  /// by libomptarget.
  LLVM_ABI int64_t getRequiresFlags() const;

  // Returns the FirstSeparator if set, otherwise use the default separator
  // depending on isGPU.
  StringRef firstSeparator() const {
    if (FirstSeparator.has_value())
      return *FirstSeparator;
    if (isGPU())
      return "_";
    return ".";
  }

  // Returns the Separator if set, otherwise use the default separator depending
  // on isGPU.
  StringRef separator() const {
    if (Separator.has_value())
      return *Separator;
    if (isGPU())
      return "$";
    return ".";
  }

  void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
  void setIsGPU(bool Value) { IsGPU = Value; }
  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
  void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
  void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
  void setSeparator(StringRef S) { Separator = S; }
  void setGridValue(omp::GV G) { GridValue = G; }

  LLVM_ABI void setHasRequiresReverseOffload(bool Value);
  LLVM_ABI void setHasRequiresUnifiedAddress(bool Value);
  LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value);
  LLVM_ABI void setHasRequiresDynamicAllocators(bool Value);

private:
  /// Flags for specifying which requires directive clauses are present.
  int64_t RequiresFlags;
};
210 
211 /// Data structure to contain the information needed to uniquely identify
212 /// a target entry.
213 struct TargetRegionEntryInfo {
214   /// The prefix used for kernel names.
215   static constexpr const char *KernelNamePrefix = "__omp_offloading_";
216 
217   std::string ParentName;
218   unsigned DeviceID;
219   unsigned FileID;
220   unsigned Line;
221   unsigned Count;
222 
TargetRegionEntryInfoTargetRegionEntryInfo223   TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
224   TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
225                         unsigned FileID, unsigned Line, unsigned Count = 0)
ParentNameTargetRegionEntryInfo226       : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
227         Count(Count) {}
228 
229   LLVM_ABI static void
230   getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, StringRef ParentName,
231                              unsigned DeviceID, unsigned FileID, unsigned Line,
232                              unsigned Count);
233 
234   bool operator<(const TargetRegionEntryInfo &RHS) const {
235     return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
236            std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
237                            RHS.Count);
238   }
239 };
240 
/// Class that manages information about offload code regions and data.
class OffloadEntriesInfoManager {
  /// The OpenMPIRBuilder this manager belongs to.
  OpenMPIRBuilder *OMPBuilder;
  /// Number of entries registered so far.
  unsigned OffloadingEntriesNum = 0;

public:
  /// Base class of the entries info.
  class OffloadEntryInfo {
  public:
    /// Kind of a given entry.
    enum OffloadingEntryInfoKinds : unsigned {
      /// Entry is a target region.
      OffloadingEntryInfoTargetRegion = 0,
      /// Entry is a declare target variable.
      OffloadingEntryInfoDeviceGlobalVar = 1,
      /// Invalid entry info.
      OffloadingEntryInfoInvalid = ~0u
    };

  protected:
    OffloadEntryInfo() = delete;
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
                              uint32_t Flags)
        : Flags(Flags), Order(Order), Kind(Kind) {}
    ~OffloadEntryInfo() = default;

  public:
    // An entry is valid once an emission order has been assigned.
    bool isValid() const { return Order != ~0u; }
    unsigned getOrder() const { return Order; }
    OffloadingEntryInfoKinds getKind() const { return Kind; }
    uint32_t getFlags() const { return Flags; }
    void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
    Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
    void setAddress(Constant *V) {
      assert(!Addr.pointsToAliveValue() && "Address has been set before!");
      Addr = V;
    }
    static bool classof(const OffloadEntryInfo *Info) { return true; }

  private:
    /// Address of the entity that has to be mapped for offloading.
    WeakTrackingVH Addr;

    /// Flags associated with the device global.
    uint32_t Flags = 0u;

    /// Order this entry was emitted.
    unsigned Order = ~0u;

    /// Discriminator for the entry hierarchy (see classof above).
    OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
  };

  /// Return true if there are no entries defined.
  LLVM_ABI bool empty() const;
  /// Return number of entries defined so far.
  unsigned size() const { return OffloadingEntriesNum; }

  OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}

  //
  // Target region entries related.
  //

  /// Kind of the target registry entry.
  enum OMPTargetRegionEntryKind : uint32_t {
    /// Mark the entry as target region.
    OMPTargetRegionEntryTargetRegion = 0x0,
  };

  /// Target region entries info.
  class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
    /// Address that can be used as the ID of the entry.
    Constant *ID = nullptr;

  public:
    OffloadEntryInfoTargetRegion()
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
    explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
                                          Constant *ID,
                                          OMPTargetRegionEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
          ID(ID) {
      setAddress(Addr);
    }

    Constant *getID() const { return ID; }
    void setID(Constant *V) {
      assert(!ID && "ID has been set before!");
      ID = V;
    }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoTargetRegion;
    }
  };

  /// Initialize target region entry.
  /// This is ONLY needed for DEVICE compilation.
  LLVM_ABI void
  initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
                                  unsigned Order);
  /// Register target region entry.
  LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                              Constant *Addr, Constant *ID,
                                              OMPTargetRegionEntryKind Flags);
  /// Return true if a target region entry with the provided information
  /// exists.
  LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                         bool IgnoreAddressId = false) const;

  // Return the Name based on \a EntryInfo using the next available Count.
  LLVM_ABI void
  getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                             const TargetRegionEntryInfo &EntryInfo);

  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
                            const OffloadEntryInfoTargetRegion &)>
      OffloadTargetRegionEntryInfoActTy;
  LLVM_ABI void
  actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);

  //
  // Device global variable entries related.
  //

  /// Kind of the global variable entry.
  enum OMPTargetGlobalVarEntryKind : uint32_t {
    /// Mark the entry as a to declare target.
    OMPTargetGlobalVarEntryTo = 0x0,
    /// Mark the entry as a to declare target link.
    OMPTargetGlobalVarEntryLink = 0x1,
    /// Mark the entry as a declare target enter.
    OMPTargetGlobalVarEntryEnter = 0x2,
    /// Mark the entry as having no declare target entry kind.
    OMPTargetGlobalVarEntryNone = 0x3,
    /// Mark the entry as a declare target indirect global.
    OMPTargetGlobalVarEntryIndirect = 0x8,
    /// Mark the entry as a register requires global.
    OMPTargetGlobalRegisterRequires = 0x10,
  };

  /// Kind of device clause for declare target variables
  /// and functions
  /// NOTE: Currently not used as a part of a variable entry
  /// used for Flang and Clang to interface with the variable
  /// related registration functions
  enum OMPTargetDeviceClauseKind : uint32_t {
    /// The target is marked for all devices.
    OMPTargetDeviceClauseAny = 0x0,
    /// The target is marked for non-host devices.
    OMPTargetDeviceClauseNoHost = 0x1,
    /// The target is marked for host devices.
    OMPTargetDeviceClauseHost = 0x2,
    /// The target is marked as having no clause.
    OMPTargetDeviceClauseNone = 0x3
  };

  /// Device global variable entries info.
  class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
    /// Size of the global variable (in bytes).
    int64_t VarSize;
    /// Linkage the variable is emitted with.
    GlobalValue::LinkageTypes Linkage;
    /// Mangled name of the variable.
    const std::string VarName;

  public:
    OffloadEntryInfoDeviceGlobalVar()
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
                                             OMPTargetGlobalVarEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
                                             int64_t VarSize,
                                             OMPTargetGlobalVarEntryKind Flags,
                                             GlobalValue::LinkageTypes Linkage,
                                             const std::string &VarName)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
          VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
      setAddress(Addr);
    }

    int64_t getVarSize() const { return VarSize; }
    StringRef getVarName() const { return VarName; }
    void setVarSize(int64_t Size) { VarSize = Size; }
    GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
    void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
    }
  };

  /// Initialize device global variable entry.
  /// This is ONLY used for DEVICE compilation.
  LLVM_ABI void initializeDeviceGlobalVarEntryInfo(
      StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order);

  /// Register device global variable entry.
  LLVM_ABI void registerDeviceGlobalVarEntryInfo(
      StringRef VarName, Constant *Addr, int64_t VarSize,
      OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage);
  /// Checks if the variable with the given name has been registered already.
  bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
    return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
  }
  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
      OffloadDeviceGlobalVarEntryInfoActTy;
  LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(
      const OffloadDeviceGlobalVarEntryInfoActTy &Action);

private:
  /// Return the count of entries at a particular source location.
  unsigned
  getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;

  /// Update the count of entries at a particular source location.
  void
  incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);

  // Key used for the per-location entry count map: same location, Count = 0.
  static TargetRegionEntryInfo
  getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
    return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
                                 EntryInfo.FileID, EntryInfo.Line, 0);
  }

  // Count of entries at a location.
  std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;

  // Storage for target region entries kind.
  typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
      OffloadEntriesTargetRegionTy;
  OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
  /// Storage for device global variable entries kind. The storage is to be
  /// indexed by mangled name.
  typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
      OffloadEntriesDeviceGlobalVarTy;
  OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
};
480 
481 /// An interface to create LLVM-IR for OpenMP directives.
482 ///
483 /// Each OpenMP directive has a corresponding public generator method.
484 class OpenMPIRBuilder {
485 public:
  /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
  /// not have an effect on \p M (see initialize).
  OpenMPIRBuilder(Module &M)
      : M(M), Builder(M.getContext()), OffloadInfoManager(this),
        T(M.getTargetTriple()), IsFinalized(false) {}
  LLVM_ABI ~OpenMPIRBuilder();

  /// OpenMP-specific specialization of llvm::AtomicInfo that remembers the
  /// value being updated atomically (\p AtomicVar).
  class AtomicInfo : public llvm::AtomicInfo {
    /// The pointer the atomic operation applies to.
    llvm::Value *AtomicVar;

  public:
    AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits,
               uint64_t ValueSizeInBits, llvm::Align AtomicAlign,
               llvm::Align ValueAlign, bool UseLibcall,
               IRBuilderBase::InsertPoint AllocaIP, llvm::Value *AtomicVar)
        : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits,
                           AtomicAlign, ValueAlign, UseLibcall, AllocaIP),
          AtomicVar(AtomicVar) {}

    llvm::Value *getAtomicPointer() const override { return AtomicVar; }
    // No TBAA metadata is attached for OpenMP atomics.
    void decorateWithTBAA(llvm::Instruction *I) override {}
    llvm::AllocaInst *CreateAlloca(llvm::Type *Ty,
                                   const llvm::Twine &Name) const override {
      llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
      allocaInst->setName(Name);
      return allocaInst;
    }
  };
  /// Initialize the internal state, this will put structures types and
  /// potentially other helpers into the underlying module. Must be called
  /// before any other method and only once! This internal state includes types
  /// used in the OpenMPIRBuilder generated from OMPKinds.def.
  LLVM_ABI void initialize();

  /// Set the configuration to use for subsequent code generation.
  void setConfig(OpenMPIRBuilderConfig C) { Config = C; }

  /// Finalize the underlying module, e.g., by outlining regions.
  /// \param Fn                    The function to be finalized. If not used,
  ///                              all functions are finalized.
  LLVM_ABI void finalize(Function *Fn = nullptr);

  /// Check whether the finalize function has already run.
  /// \return true if the finalize function has already run.
  LLVM_ABI bool isFinalized();

  /// Add attributes known for \p FnID to \p Fn.
  LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn);

  /// Type used throughout for insertion points.
  using InsertPointTy = IRBuilder<>::InsertPoint;

  /// Type used to represent an insertion point or an error value.
  using InsertPointOrErrorTy = Expected<InsertPointTy>;

  /// Create a name using the platform specific separators.
  /// \param Parts parts of the final name that need separation
  /// The created name has a first separator between the first and second part
  /// and a second separator between all other parts.
  /// E.g. with FirstSeparator "$" and Separator "." and
  /// parts: "p1", "p2", "p3", "p4"
  /// The resulting name is "p1$p2.p3.p4"
  /// The separators are retrieved from the OpenMPIRBuilderConfig.
  LLVM_ABI std::string
  createPlatformSpecificName(ArrayRef<StringRef> Parts) const;

  /// Callback type for variable finalization (think destructors).
  ///
  /// \param CodeGenIP is the insertion point at which the finalization code
  ///                  should be placed.
  ///
  /// A finalize callback knows about all objects that need finalization, e.g.
  /// destruction, when the scope of the currently generated construct is left
  /// at the time, and location, the callback is invoked.
  using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;

  struct FinalizationInfo {
    /// The finalization callback provided by the last in-flight invocation of
    /// createXXXX for the directive of kind DK.
    FinalizeCallbackTy FiniCB;

    /// The directive kind of the innermost directive that has an associated
    /// region which might require finalization when it is left.
    omp::Directive DK;

    /// Flag to indicate if the directive is cancellable.
    bool IsCancellable;
  };

  /// Push a finalization callback on the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void pushFinalizationCB(const FinalizationInfo &FI) {
    FinalizationStack.push_back(FI);
  }

  /// Pop the last finalization callback from the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void popFinalizationCB() { FinalizationStack.pop_back(); }
585 
  /// Callback type for body (=inner region) code generation
  ///
  /// The callback takes code locations as arguments, each describing a
  /// location where additional instructions can be inserted.
  ///
  /// The CodeGenIP may be in the middle of a basic block or point to the end of
  /// it. The basic block may have a terminator or be degenerate. The callback
  /// function may just insert instructions at that position, but also split the
  /// block (without the Before argument of BasicBlock::splitBasicBlock such
  /// that the identity of the split predecessor block is preserved) and insert
  /// additional control flow, including branches that do not lead back to what
  /// follows the CodeGenIP. Note that since the callback is allowed to split
  /// the block, callers must assume that InsertPoints to positions in the
  /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
  /// such InsertPoints need to be preserved, it can split the block itself
  /// before calling the callback.
  ///
  /// AllocaIP and CodeGenIP must not point to the same position.
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed. The BasicBlock it is pointing to must
  ///                 not be split.
  /// \param CodeGenIP is the insertion point at which the body code should be
  ///                  placed.
  ///
  /// \return an error, if any were triggered during execution.
  using BodyGenCallbackTy =
      function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  // This is created primarily for sections construct as llvm::function_ref
  // (BodyGenCallbackTy) is not storable (as described in the comments of
  // function_ref class - function_ref contains non-ownable reference
  // to the callable.
  ///
  /// \return an error, if any were triggered during execution.
  using StorableBodyGenCallbackTy =
      std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  /// Callback type for loop body code generation.
  ///
  /// \param CodeGenIP is the insertion point where the loop's body code must be
  ///                  placed. This will be a dedicated BasicBlock with a
  ///                  conditional branch from the loop condition check and
  ///                  terminated with an unconditional branch to the loop
  ///                  latch.
  /// \param IndVar    is the induction variable usable at the insertion point.
  ///
  /// \return an error, if any were triggered during execution.
  using LoopBodyGenCallbackTy =
      function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;

  /// Callback type for variable privatization (think copy & default
  /// constructor).
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed.
  /// \param CodeGenIP is the insertion point at which the privatization code
  ///                  should be placed.
  /// \param Original The value being copied/created, should not be used in the
  ///                 generated IR.
  /// \param Inner The equivalent of \p Original that should be used in the
  ///              generated IR; this is equal to \p Original if the value is
  ///              a pointer and can thus be passed directly, otherwise it is
  ///              an equivalent but different value.
  /// \param ReplVal The replacement value, thus a copy or new created version
  ///                of \p Inner.
  ///
  /// \returns The new insertion point where code generation continues and
  ///          \p ReplVal the replacement value.
  using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
      Value &Inner, Value *&ReplVal)>;

  /// Description of a LLVM-IR insertion point (IP) and a debug/source location
  /// (filename, line, column, ...).
  struct LocationDescription {
    /// Capture the builder's current insertion point and debug location.
    LocationDescription(const IRBuilderBase &IRB)
        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
    LocationDescription(const InsertPointTy &IP) : IP(IP) {}
    LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
        : IP(IP), DL(DL) {}
    InsertPointTy IP;
    DebugLoc DL;
  };
670 
671   /// Emitter methods for OpenMP directives.
672   ///
673   ///{
674 
  /// Generator for '#omp barrier'
  ///
  /// \param Loc The location where the barrier directive was encountered.
  /// \param Kind The kind of directive that caused the barrier.
  /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
  /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
  ///                        should be checked and acted upon.
  ///
  /// \returns The insertion point after the barrier.
  LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc,
                                              omp::Directive Kind,
                                              bool ForceSimpleCall = false,
                                              bool CheckCancelFlag = true);

  /// Generator for '#omp cancel'
  ///
  /// \param Loc The location where the directive was encountered.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param CanceledDirective The kind of directive that is canceled.
  ///
  /// \returns The insertion point after the cancellation.
  LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc,
                                             Value *IfCondition,
                                             omp::Directive CanceledDirective);

  /// Generator for '#omp cancellation point'
  ///
  /// \param Loc The location where the directive was encountered.
  /// \param CanceledDirective The kind of directive that is canceled.
  ///
  /// \returns The insertion point after the cancellation point.
  LLVM_ABI InsertPointOrErrorTy createCancellationPoint(
      const LocationDescription &Loc, omp::Directive CanceledDirective);

  /// Generator for '#omp parallel'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param PrivCB Callback to copy a given variable (think copy constructor).
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
  /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
  /// \param IsCancellable Flag to indicate a cancellable parallel region.
  ///
  /// \returns The insertion position *after* the parallel.
  LLVM_ABI InsertPointOrErrorTy createParallel(
      const LocationDescription &Loc, InsertPointTy AllocaIP,
      BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
      FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
      omp::ProcBindKind ProcBind, bool IsCancellable);
728 
  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// This generator operates on the logical iteration space of the loop, i.e.
  /// the caller only has to provide a loop trip count of the loop as defined by
  /// base language semantics. The trip count is interpreted as an unsigned
  /// integer. The induction variable passed to \p BodyGenCB will be of the same
  /// type and run from 0 to \p TripCount - 1. It is up to the callback to
  /// convert the logical iteration variable to the loop counter variable in the
  /// loop body.
  ///
  /// \param Loc       The insert and source location description. The insert
  ///                  location can be between two instructions or the end of a
  ///                  degenerate block (e.g. a BB under construction).
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param TripCount Number of iterations the loop body is executed.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  LLVM_ABI Expected<CanonicalLoopInfo *>
  createCanonicalLoop(const LocationDescription &Loc,
                      LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
                      const Twine &Name = "loop");

  /// Calculate the trip count of a canonical loop.
  ///
  /// This allows specifying user-defined loop counter values using increment,
  /// upper- and lower bounds. To disambiguate the terminology when counting
  /// downwards, instead of lower bounds we use \p Start for the loop counter
  /// value in the first body iteration.
  ///
  /// Consider the following limitations:
  ///
  ///  * A loop counter space over all integer values of its bit-width cannot be
  ///    represented. E.g. using uint8_t, its loop trip count of 256 cannot be
  ///    stored into an 8 bit integer:
  ///
  ///      DO I = 0, 255, 1
  ///
  ///  * Unsigned wrapping is only supported when wrapping only "once"; E.g.
  ///    effectively counting downwards:
  ///
  ///      for (uint8_t i = 100u; i > 0; i += 127u)
  ///
  ///
  /// TODO: May need to add additional parameters to represent:
  ///
  ///  * Allow representing downcounting with unsigned integers.
  ///
  ///  * Sign of the step and the comparison operator might disagree:
  ///
  ///      for (int i = 0; i < 42; i -= 1u)
  ///
  /// \param Loc       The insert and source location description.
  /// \param Start     Value of the loop counter for the first iteration.
  /// \param Stop      Loop counter values past this will stop the loop.
  /// \param Step      Loop counter increment after each iteration; negative
  ///                  means counting down.
  /// \param IsSigned  Whether Start, Stop and Step are signed integers.
  /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
  ///                      counter.
  /// \param Name      Base name used to derive instruction names.
  ///
  /// \returns The value holding the calculated trip count.
  LLVM_ABI Value *calculateCanonicalLoopTripCount(
      const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
      bool IsSigned, bool InclusiveStop, const Twine &Name = "loop");
796 
  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// Instead of a logical iteration space, this allows specifying user-defined
  /// loop counter values using increment, upper- and lower bounds. To
  /// disambiguate the terminology when counting downwards, instead of lower
  /// bounds we use \p Start for the loop counter value in the first body
  /// iteration.
  ///
  /// It calls \see calculateCanonicalLoopTripCount for trip count calculations,
  /// so limitations of that method apply here as well.
  ///
  /// \param Loc       The insert and source location description.
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param Start     Value of the loop counter for the first iteration.
  /// \param Stop      Loop counter values past this will stop the loop.
  /// \param Step      Loop counter increment after each iteration; negative
  ///                  means counting down.
  /// \param IsSigned  Whether Start, Stop and Step are signed integers.
  /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
  ///                      counter.
  /// \param ComputeIP Insertion point for instructions computing the trip
  ///                  count. Can be used to ensure the trip count is available
  ///                  at the outermost loop of a loop nest. If not set,
  ///                  defaults to the preheader of the generated loop.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop(
      const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
      Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
      InsertPointTy ComputeIP = {}, const Twine &Name = "loop");
828 
  /// Collapse a loop nest into a single loop.
  ///
  /// Merges loops of a loop nest into a single CanonicalLoopNest representation
  /// that has the same number of innermost loop iterations as the original loop
  /// nest. The induction variables of the input loops are derived from the
  /// collapsed loop's induction variable. This is intended to be used to
  /// implement OpenMP's collapse clause. Before applying a directive,
  /// collapseLoops normalizes a loop nest to contain only a single loop and the
  /// directive's implementation does not need to handle multiple loops itself.
  /// This does not remove the need to handle all loop nest handling by
  /// directives, such as the ordered(<n>) clause or the simd schedule-clause
  /// modifier of the worksharing-loop directive.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
  ///       body(i, j);
  /// \endcode
  ///
  /// After collapsing with Loops={i,j}, the loop is changed to
  /// \code
  ///   for (int ij = 0; ij < 63; ++ij) {
  ///     int i = ij / 9;
  ///     int j = ij % 9;
  ///     body(i, j);
  ///   }
  /// \endcode
  ///
  /// In the current implementation, the following limitations apply:
  ///
  ///  * All input loops have an induction variable of the same type.
  ///
  ///  * The collapsed loop will have the same trip count integer type as the
  ///    input loops. Therefore it is possible that the collapsed loop cannot
  ///    represent all iterations of the input loops. For instance, assuming a
  ///    32 bit integer type, and two input loops both iterating 2^16 times, the
  ///    theoretical trip count of the collapsed loop would be 2^32 iterations,
  ///    which cannot be represented in a 32-bit integer. Behavior is undefined
  ///    in this case.
  ///
  ///  * The trip counts of every input loop must be available at \p ComputeIP.
  ///    Non-rectangular loops are not yet supported.
  ///
  ///  * At each nest level, code between a surrounding loop and its nested loop
  ///    is hoisted into the loop body, and such code will be executed more
  ///    often than before collapsing (or not at all if any inner loop iteration
  ///    has a trip count of 0). This is permitted by the OpenMP specification.
  ///
  /// \param DL        Debug location for instructions added for collapsing,
  ///                  such as instructions to compute/derive the input loop's
  ///                  induction variables.
  /// \param Loops     Loops in the loop nest to collapse. Loops are specified
  ///                  from outermost-to-innermost and every control flow of a
  ///                  loop's body must pass through its directly nested loop.
  /// \param ComputeIP Where additional instructions that compute the collapsed
  ///                  trip count are inserted. If not set, defaults to before
  ///                  the generated loop.
  ///
  /// \returns The CanonicalLoopInfo object representing the collapsed loop.
  LLVM_ABI CanonicalLoopInfo *collapseLoops(DebugLoc DL,
                                            ArrayRef<CanonicalLoopInfo *> Loops,
                                            InsertPointTy ComputeIP);
892 
  /// Get the default alignment value for given target
  ///
  /// \param TargetTriple   Target triple
  /// \param Features       StringMap which describes extra CPU features
  LLVM_ABI static unsigned
  getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                            const StringMap<bool> &Features);

  /// Retrieve (or create if non-existent) the address of a declare
  /// target variable, used in conjunction with registerTargetGlobalVariable
  /// to create declare target global variables.
  ///
  /// \param CaptureClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (link,
  /// to, enter).
  /// \param DeviceClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (nohost,
  /// host, any)
  /// \param IsDeclaration - boolean stating if the variable being registered
  /// is a declaration-only and not a definition
  /// \param IsExternallyVisible - boolean stating if the variable is externally
  /// visible
  /// \param EntryInfo - Unique entry information for the value generated
  /// using getTargetEntryUniqueInfo, used to name generated pointer references
  /// to the declare target variable
  /// \param MangledName - the mangled name of the variable being registered
  /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
  /// these are required by Clang for bookkeeping.
  /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
  /// \param TargetTriple - The OpenMP device target triple we are compiling
  /// for
  /// \param LlvmPtrTy - The type of the variable we are generating or
  /// retrieving an address for
  /// \param GlobalInitializer - a lambda function which creates a constant
  /// used for initializing a pointer reference to the variable in certain
  /// cases. If a nullptr is passed, it will default to utilising the original
  /// variable to initialize the pointer reference.
  /// \param VariableLinkage - a lambda function which returns the variable's
  /// linkage type, if unspecified and a nullptr is given, it will instead
  /// utilise the linkage stored on the existing global variable in the
  /// LLVMModule.
  LLVM_ABI Constant *getAddrOfDeclareTargetVar(
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
      OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
      bool IsDeclaration, bool IsExternallyVisible,
      TargetRegionEntryInfo EntryInfo, StringRef MangledName,
      std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
      std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
      std::function<Constant *()> GlobalInitializer,
      std::function<GlobalValue::LinkageTypes()> VariableLinkage);
944 
  /// Registers a target variable for device or host.
  ///
  /// \param CaptureClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (link,
  /// to, enter).
  /// \param DeviceClause - enumerator corresponding to the OpenMP capture
  /// clause used in conjunction with the variable being registered (nohost,
  /// host, any)
  /// \param IsDeclaration - boolean stating if the variable being registered
  /// is a declaration-only and not a definition
  /// \param IsExternallyVisible - boolean stating if the variable is externally
  /// visible
  /// \param EntryInfo - Unique entry information for the value generated
  /// using getTargetEntryUniqueInfo, used to name generated pointer references
  /// to the declare target variable
  /// \param MangledName - the mangled name of the variable being registered
  /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable these are required by Clang for
  /// bookkeeping.
  /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
  /// \param TargetTriple - The OpenMP device target triple we are compiling
  /// for
  /// \param GlobalInitializer - a lambda function which creates a constant
  /// used for initializing a pointer reference to the variable in certain
  /// cases. If a nullptr is passed, it will default to utilising the original
  /// variable to initialize the pointer reference.
  /// \param VariableLinkage - a lambda function which returns the variable's
  /// linkage type, if unspecified and a nullptr is given, it will instead
  /// utilise the linkage stored on the existing global variable in the
  /// LLVMModule.
  /// \param LlvmPtrTy - The type of the variable we are generating or
  /// retrieving an address for
  /// \param Addr - the original llvm value (addr) of the variable to be
  /// registered
  LLVM_ABI void registerTargetGlobalVariable(
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
      OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
      bool IsDeclaration, bool IsExternallyVisible,
      TargetRegionEntryInfo EntryInfo, StringRef MangledName,
      std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
      std::vector<Triple> TargetTriple,
      std::function<Constant *()> GlobalInitializer,
      std::function<GlobalValue::LinkageTypes()> VariableLinkage,
      Type *LlvmPtrTy, Constant *Addr);

  /// Get the offset of the OMP_MAP_MEMBER_OF field.
  LLVM_ABI unsigned getFlagMemberOffset();

  /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
  /// the position given.
  /// \param Position - A value indicating the position of the parent
  /// of the member in the kernel argument structure, often retrieved
  /// by the parent's position in the combined information vectors used
  /// to generate the structure itself. Multiple children (members of)
  /// with the same parent will use the same returned member flag.
  LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);

  /// Given an initial flag set, this function modifies it to contain
  /// the passed in MemberOfFlag generated from the getMemberOfFlag
  /// function. The results are dependent on the existing flag bits
  /// set in the original flag set.
  /// \param Flags - The original set of flags to be modified with the
  /// passed in MemberOfFlag.
  /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
  /// slightly based on the getMemberOfFlag which adjusts the flag bits
  /// based on the member's position in its parent.
  LLVM_ABI void
  setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
                         omp::OpenMPOffloadMappingFlags MemberOfFlag);
1014 
1015 private:
  /// Modifies the canonical loop to be a statically-scheduled workshare loop
  /// which is executed on the device
  ///
  /// This takes a \p CLI representing a canonical loop, such as the one
  /// created by \see createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to call the OpenMP device rtl function
  /// which handles worksharing of the loop body iterations.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param LoopType Information about type of loop worksharing.
  ///                 It corresponds to type of loop workshare OpenMP pragma.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                         InsertPointTy AllocaIP,
                                         omp::WorksharingLoopType LoopType);
1037 
  /// Modifies the canonical loop to be a statically-scheduled workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds to be used in
  /// the current thread, updates the relevant instructions in the canonical
  /// loop and calls to an OpenMP runtime finalization function after the loop.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  /// \param LoopType Type of workshare loop.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointOrErrorTy applyStaticWorkshareLoop(
      DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
      omp::WorksharingLoopType LoopType, bool NeedsBarrier);

  /// Modifies the canonical loop to be a statically-scheduled workshare loop
  /// with a user-specified chunk size.
  ///
  /// \param DL           Debug location for instructions added for the
  ///                     workshare-loop construct itself.
  /// \param CLI          A descriptor of the canonical loop to workshare.
  /// \param AllocaIP     An insertion point for Alloca instructions usable in
  ///                     the preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
  ///                     loop.
  /// \param ChunkSize    The user-specified chunk size.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
                                                       CanonicalLoopInfo *CLI,
                                                       InsertPointTy AllocaIP,
                                                       bool NeedsBarrier,
                                                       Value *ChunkSize);

  /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain, and then in each iteration
  /// to update the loop counter.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param SchedType Type of scheduling to be passed to the init function.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  /// \param Chunk    The size of loop chunk considered as a unit when
  ///                 scheduling. If \p nullptr, defaults to 1.
  ///
  /// \returns Point where to insert code after the workshare construct.
  InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
                                                 CanonicalLoopInfo *CLI,
                                                 InsertPointTy AllocaIP,
                                                 omp::OMPScheduleType SchedType,
                                                 bool NeedsBarrier,
                                                 Value *Chunk = nullptr);
1106 
  /// Create alternative version of the loop to support if clause
  ///
  /// OpenMP if clause can require to generate second loop. This loop
  /// will be executed when if clause condition is not met. createIfVersion
  /// adds branch instruction to the copied loop if \p IfCond is not met.
  ///
  /// \param Loop       Original loop which should be versioned.
  /// \param IfCond     Value which corresponds to if clause condition
  /// \param VMap       Value to value map to define relation between
  ///                   original and copied loop values and loop blocks.
  /// \param LIA        Loop analysis instance to use while versioning.
  /// \param LI         Loop info for the function containing the loop.
  /// \param L          LLVM loop being versioned; presumably the llvm::Loop
  ///                   corresponding to \p Loop — verify against callers.
  /// \param NamePrefix Optional name prefix for if.then if.else blocks.
  void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
                       ValueMap<const Value *, WeakTrackingVH> &VMap,
                       LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
                       const Twine &NamePrefix = "");
1122 
1123 public:
  /// Modifies the canonical loop to be a workshare loop.
  ///
  /// This takes a \p LoopInfo representing a canonical loop, such as the one
  /// created by \p createCanonicalLoop and emits additional instructions to
  /// turn it into a workshare loop. In particular, it calls to an OpenMP
  /// runtime function in the preheader to obtain the loop bounds to be used in
  /// the current thread, updates the relevant instructions in the canonical
  /// loop and calls to an OpenMP runtime finalization function after the loop.
  ///
  /// The concrete transformation is done by applyStaticWorkshareLoop,
  /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
  /// on the value of \p SchedKind and \p ChunkSize.
  ///
  /// \param DL       Debug location for instructions added for the
  ///                 workshare-loop construct itself.
  /// \param CLI      A descriptor of the canonical loop to workshare.
  /// \param AllocaIP An insertion point for Alloca instructions usable in the
  ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
  ///                     the loop.
  /// \param SchedKind Scheduling algorithm to use.
  /// \param ChunkSize The chunk size for the inner loop.
  /// \param HasSimdModifier Whether the simd modifier is present in the
  ///                        schedule clause.
  /// \param HasMonotonicModifier Whether the monotonic modifier is present in
  ///                             the schedule clause.
  /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
  ///                                present in the schedule clause.
  /// \param HasOrderedClause Whether the (parameterless) ordered clause is
  ///                         present.
  /// \param LoopType Information about type of loop worksharing.
  ///                 It corresponds to type of loop workshare OpenMP pragma.
  ///
  /// \returns Point where to insert code after the workshare construct.
  LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
      DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
      bool NeedsBarrier,
      llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
      Value *ChunkSize = nullptr, bool HasSimdModifier = false,
      bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
      bool HasOrderedClause = false,
      omp::WorksharingLoopType LoopType =
          omp::WorksharingLoopType::ForStaticLoop);
1167 
  /// Tile a loop nest.
  ///
  /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
  /// \p Loops must be perfectly nested, from outermost to innermost loop
  /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
  /// of every loop and every tile sizes must be usable in the outermost
  /// loop's preheader. This implies that the loop nest is rectangular.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 15; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 14; ++j) // Canonical loop "j"
  ///         body(i, j);
  /// \endcode
  ///
  /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
  /// \code
  ///   for (int i1 = 0; i1 < 3; ++i1)
  ///     for (int j1 = 0; j1 < 2; ++j1)
  ///       for (int i2 = 0; i2 < 5; ++i2)
  ///         for (int j2 = 0; j2 < 7; ++j2)
  ///           body(i1*5+i2, j1*7+j2);
  /// \endcode
  ///
  /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are
  /// referred to as the floor, and the loops i2 and j2 are the tiles. Tiling
  /// also handles non-constant trip counts, non-constant tile sizes and trip
  /// counts that are not multiples of the tile size. In the latter case the
  /// tile loop of the last floor-loop iteration will have fewer iterations
  /// than specified as its tile size.
  ///
  ///
  /// \param DL        Debug location for instructions added by tiling, for
  ///                  instance the floor- and tile trip count computation.
  /// \param Loops     Loops to tile. The CanonicalLoopInfo objects are
  ///                  invalidated by this method, i.e. should not be used
  ///                  after tiling.
  /// \param TileSizes For each loop in \p Loops, the tile size for that
  ///                  dimension.
  ///
  /// \returns A list of generated loops. Contains twice as many loops as the
  ///          input loop nest; the first half are the floor loops and the
  ///          second half are the tile loops.
  LLVM_ABI std::vector<CanonicalLoopInfo *>
  tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
            ArrayRef<Value *> TileSizes);
1214 
1215   /// Fully unroll a loop.
1216   ///
1217   /// Instead of unrolling the loop immediately (and duplicating its body
1218   /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1219   /// metadata.
1220   ///
1221   /// \param DL   Debug location for instructions added by unrolling.
1222   /// \param Loop The loop to unroll. The loop will be invalidated.
1223   LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1224 
1225   /// Fully or partially unroll a loop. How the loop is unrolled is determined
1226   /// using LLVM's LoopUnrollPass.
1227   ///
1228   /// \param DL   Debug location for instructions added by unrolling.
1229   /// \param Loop The loop to unroll. The loop will be invalidated.
1230   LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1231 
1232   /// Partially unroll a loop.
1233   ///
1234   /// The CanonicalLoopInfo of the unrolled loop for use with chained
1235   /// loop-associated directive can be requested using \p UnrolledCLI. Not
1236   /// needing the CanonicalLoopInfo allows more efficient code generation by
1237   /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1238   /// A loop-associated directive applied to the unrolled loop needs to know the
1239   /// new trip count which means that if using a heuristically determined unroll
1240   /// factor (\p Factor == 0), that factor must be computed immediately. We are
1241   /// using the same logic as the LoopUnrollPass to derive the unroll factor,
1242   /// but which assumes that some canonicalization has taken place (e.g.
1243   /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1244   /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1245   ///
1246   /// \param DL          Debug location for instructions added by unrolling.
1247   /// \param Loop        The loop to unroll. The loop will be invalidated.
1248   /// \param Factor      The factor to unroll the loop by. A factor of 0
1249   ///                    indicates that a heuristic should be used to determine
1250   ///                    the unroll-factor.
1251   /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1252   ///                    partially unrolled loop. Otherwise, uses loop metadata
1253   ///                    to defer unrolling to the LoopUnrollPass.
1254   LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
1255                                   int32_t Factor,
1256                                   CanonicalLoopInfo **UnrolledCLI);
1257 
1258   /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
1259   /// is cloned. The metadata which prevents vectorization is added to
1260   /// the cloned loop. The cloned loop is executed when IfCond evaluates
1261   /// to false.
1262   ///
1263   /// \param Loop        The loop to simd-ize.
1264   /// \param AlignedVars The map which contains pairs of the pointer
1265   ///                    and its corresponding alignment.
1266   /// \param IfCond      The value which corresponds to the if clause
1267   ///                    condition.
1268   /// \param Order       The enum to map order clause.
1269   /// \param Simdlen     The Simdlen length to apply to the simd loop.
1270   /// \param Safelen     The Safelen length to apply to the simd loop.
1271   LLVM_ABI void applySimd(CanonicalLoopInfo *Loop,
1272                           MapVector<Value *, Value *> AlignedVars,
1273                           Value *IfCond, omp::OrderKind Order,
1274                           ConstantInt *Simdlen, ConstantInt *Safelen);
1275 
1276   /// Generator for '#omp flush'
1277   ///
1278   /// \param Loc The location where the flush directive was encountered
1279   LLVM_ABI void createFlush(const LocationDescription &Loc);
1280 
1281   /// Generator for '#omp taskwait'
1282   ///
1283   /// \param Loc The location where the taskwait directive was encountered.
1284   LLVM_ABI void createTaskwait(const LocationDescription &Loc);
1285 
1286   /// Generator for '#omp taskyield'
1287   ///
1288   /// \param Loc The location where the taskyield directive was encountered.
1289   LLVM_ABI void createTaskyield(const LocationDescription &Loc);
1290 
1291   /// A struct to pack the relevant information for an OpenMP depend clause.
1292   struct DependData {
1293     omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
1294     Type *DepValueType;
1295     Value *DepVal;
1296     explicit DependData() = default;
DependDataDependData1297     DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
1298                Value *DepVal)
1299         : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
1300   };
1301 
1302   /// Generator for `#omp task`
1303   ///
1304   /// \param Loc The location where the task construct was encountered.
1305   /// \param AllocaIP The insertion point to be used for alloca instructions.
1306   /// \param BodyGenCB Callback that will generate the region code.
1307   /// \param Tied True if the task is tied, false if the task is untied.
1308   /// \param Final i1 value which is `true` if the task is final, `false` if the
1309   ///              task is not final.
1310   /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1311   ///                    task is generated, and the encountering thread must
1312   ///                    suspend the current task region, for which execution
1313   ///                    cannot be resumed until execution of the structured
1314   ///                    block that is associated with the generated task is
1315   ///                    completed.
1316   /// \param EventHandle If present, signifies the event handle as part of
1317   ///                    the detach clause.
1318   /// \param Mergeable If the given task is `mergeable`.
1319   /// \param Priority `priority-value` specifies the execution order of the
1320   ///                 tasks that are generated by the construct.
1321   LLVM_ABI InsertPointOrErrorTy
1322   createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1323              BodyGenCallbackTy BodyGenCB, bool Tied = true,
1324              Value *Final = nullptr, Value *IfCondition = nullptr,
1325              SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1326              Value *EventHandle = nullptr, Value *Priority = nullptr);
1327 
1328   /// Generator for the taskgroup construct
1329   ///
1330   /// \param Loc The location where the taskgroup construct was encountered.
1331   /// \param AllocaIP The insertion point to be used for alloca instructions.
1332   /// \param BodyGenCB Callback that will generate the region code.
1333   LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1334                                                 InsertPointTy AllocaIP,
1335                                                 BodyGenCallbackTy BodyGenCB);
1336 
1337   using FileIdentifierInfoCallbackTy =
1338       std::function<std::tuple<std::string, uint64_t>()>;
1339 
1340   /// Creates a unique info for a target entry when provided a filename and
1341   /// line number.
1342   ///
1343   /// \param CallBack A callback function which should return filename the entry
1344   /// resides in as well as the line number for the target entry
1345   /// \param ParentName The name of the parent the target entry resides in, if
1346   /// any.
1347   LLVM_ABI static TargetRegionEntryInfo
1348   getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
1349                            StringRef ParentName = "");
1350 
1351   /// Enum class for the ReductionGen CallBack type to be used.
1352   enum class ReductionGenCBKind { Clang, MLIR };
1353 
1354   /// ReductionGen CallBack for Clang
1355   ///
1356   /// \param CodeGenIP InsertPoint for CodeGen.
1357   /// \param Index Index of the ReductionInfo to generate code for.
1358   /// \param LHS Optionally used by Clang to return the LHS pointer it used
1359   /// for codegen, used for fixup later.
1360   /// \param RHS Optionally used by Clang to
1361   /// return the RHS pointer it used for codegen, used for fixup later.
1362   /// \param CurFn Optionally used by Clang to pass in the Current Function as
1363   /// Clang context may be old.
1364   using ReductionGenClangCBTy =
1365       std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1366                                   Value **LHS, Value **RHS, Function *CurFn)>;
1367 
1368   /// ReductionGen CallBack for MLIR
1369   ///
1370   /// \param CodeGenIP InsertPoint for CodeGen.
1371   /// \param LHS Pass in the LHS Value to be used for CodeGen.
1372   /// \param RHS Pass in the RHS Value to be used for CodeGen.
1373   using ReductionGenCBTy = std::function<InsertPointOrErrorTy(
1374       InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1375 
1376   /// Functions used to generate atomic reductions. Such functions take two
1377   /// Values representing pointers to LHS and RHS of the reduction, as well as
1378   /// the element type of these pointers. They are expected to atomically
1379   /// update the LHS to the reduced value.
1380   using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy(
1381       InsertPointTy, Type *, Value *, Value *)>;
1382 
1383   /// Enum class for reduction evaluation types scalar, complex and aggregate.
1384   enum class EvalKind { Scalar, Complex, Aggregate };
1385 
1386   /// Information about an OpenMP reduction.
1387   struct ReductionInfo {
ReductionInfoReductionInfo1388     ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
1389                   EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
1390                   ReductionGenClangCBTy ReductionGenClang,
1391                   ReductionGenAtomicCBTy AtomicReductionGen)
1392         : ElementType(ElementType), Variable(Variable),
1393           PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
1394           ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
1395           AtomicReductionGen(AtomicReductionGen) {}
ReductionInfoReductionInfo1396     ReductionInfo(Value *PrivateVariable)
1397         : ElementType(nullptr), Variable(nullptr),
1398           PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
1399           ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
1400 
1401     /// Reduction element type, must match pointee type of variable.
1402     Type *ElementType;
1403 
1404     /// Reduction variable of pointer type.
1405     Value *Variable;
1406 
1407     /// Thread-private partial reduction variable.
1408     Value *PrivateVariable;
1409 
1410     /// Reduction evaluation kind - scalar, complex or aggregate.
1411     EvalKind EvaluationKind;
1412 
1413     /// Callback for generating the reduction body. The IR produced by this will
1414     /// be used to combine two values in a thread-safe context, e.g., under
1415     /// lock or within the same thread, and therefore need not be atomic.
1416     ReductionGenCBTy ReductionGen;
1417 
1418     /// Clang callback for generating the reduction body. The IR produced by
1419     /// this will be used to combine two values in a thread-safe context, e.g.,
1420     /// under lock or within the same thread, and therefore need not be atomic.
1421     ReductionGenClangCBTy ReductionGenClang;
1422 
1423     /// Callback for generating the atomic reduction body, may be null. The IR
1424     /// produced by this will be used to atomically combine two values during
1425     /// reduction. If null, the implementation will use the non-atomic version
1426     /// along with the appropriate synchronization mechanisms.
1427     ReductionGenAtomicCBTy AtomicReductionGen;
1428   };
1429 
  /// How a Reduce list is copied by emitReductionListCopy.
  enum class CopyAction : unsigned {
    // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
    // the warp using shuffle instructions.
    RemoteLaneToThread,
    // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
    ThreadCopy,
  };
1437 
  /// Optional parameters for a Reduce-list copy (see emitReductionListCopy).
  /// All fields default to null, i.e. "not provided".
  struct CopyOptionsTy {
    /// Lane offset used by shuffle-based (RemoteLaneToThread) copies —
    /// presumably relative to the current lane; TODO confirm with the
    /// shuffle helper's semantics.
    Value *RemoteLaneOffset = nullptr;
    /// NOTE(review): ScratchpadIndex/ScratchpadWidth look like coordinates
    /// into a global scratchpad buffer for cross-team reduction — confirm
    /// against the device runtime before relying on this.
    Value *ScratchpadIndex = nullptr;
    Value *ScratchpadWidth = nullptr;
  };
1443 
1444   /// Supporting functions for Reductions CodeGen.
1445 private:
1446   /// Get the id of the current thread on the GPU.
1447   Value *getGPUThreadID();
1448 
1449   /// Get the GPU warp size.
1450   Value *getGPUWarpSize();
1451 
1452   /// Get the id of the warp in the block.
1453   /// We assume that the warp size is 32, which is always the case
1454   /// on the NVPTX device, to generate more efficient code.
1455   Value *getNVPTXWarpID();
1456 
1457   /// Get the id of the current lane in the Warp.
1458   /// We assume that the warp size is 32, which is always the case
1459   /// on the NVPTX device, to generate more efficient code.
1460   Value *getNVPTXLaneID();
1461 
1462   /// Cast value to the specified type.
1463   Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1464 
1465   /// This function creates calls to one of two shuffle functions to copy
1466   /// variables between lanes in a warp.
1467   Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1468                                       Type *ElementType, Value *Offset);
1469 
1470   /// Function to shuffle over the value from the remote lane.
1471   void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1472                        Type *ElementType, Value *Offset,
1473                        Type *ReductionArrayTy);
1474 
1475   /// Emit instructions to copy a Reduce list, which contains partially
1476   /// aggregated values, in the specified direction.
1477   void emitReductionListCopy(
1478       InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1479       ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1480       CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1481 
1482   /// Emit a helper that reduces data across two OpenMP threads (lanes)
1483   /// in the same warp.  It uses shuffle instructions to copy over data from
1484   /// a remote lane's stack.  The reduction algorithm performed is specified
1485   /// by the fourth parameter.
1486   ///
1487   /// Algorithm Versions.
1488   /// Full Warp Reduce (argument value 0):
1489   ///   This algorithm assumes that all 32 lanes are active and gathers
1490   ///   data from these 32 lanes, producing a single resultant value.
1491   /// Contiguous Partial Warp Reduce (argument value 1):
1492   ///   This algorithm assumes that only a *contiguous* subset of lanes
1493   ///   are active.  This happens for the last warp in a parallel region
1494   ///   when the user specified num_threads is not an integer multiple of
1495   ///   32.  This contiguous subset always starts with the zeroth lane.
1496   /// Partial Warp Reduce (argument value 2):
1497   ///   This algorithm gathers data from any number of lanes at any position.
1498   /// All reduced values are stored in the lowest possible lane.  The set
1499   /// of problems every algorithm addresses is a super set of those
1500   /// addressable by algorithms with a lower version number.  Overhead
1501   /// increases as algorithm version increases.
1502   ///
1503   /// Terminology
1504   /// Reduce element:
1505   ///   Reduce element refers to the individual data field with primitive
1506   ///   data types to be combined and reduced across threads.
1507   /// Reduce list:
1508   ///   Reduce list refers to a collection of local, thread-private
1509   ///   reduce elements.
1510   /// Remote Reduce list:
1511   ///   Remote Reduce list refers to a collection of remote (relative to
1512   ///   the current thread) reduce elements.
1513   ///
1514   /// We distinguish between three states of threads that are important to
1515   /// the implementation of this function.
1516   /// Alive threads:
1517   ///   Threads in a warp executing the SIMT instruction, as distinguished from
1518   ///   threads that are inactive due to divergent control flow.
1519   /// Active threads:
1520   ///   The minimal set of threads that has to be alive upon entry to this
1521   ///   function.  The computation is correct iff active threads are alive.
1522   ///   Some threads are alive but they are not active because they do not
1523   ///   contribute to the computation in any useful manner.  Turning them off
1524   ///   may introduce control flow overheads without any tangible benefits.
1525   /// Effective threads:
1526   ///   In order to comply with the argument requirements of the shuffle
1527   ///   function, we must keep all lanes holding data alive.  But at most
1528   ///   half of them perform value aggregation; we refer to this half of
1529   ///   threads as effective. The other half is simply handing off their
1530   ///   data.
1531   ///
1532   /// Procedure
1533   /// Value shuffle:
1534   ///   In this step active threads transfer data from higher lane positions
1535   ///   in the warp to lower lane positions, creating Remote Reduce list.
1536   /// Value aggregation:
1537   ///   In this step, effective threads combine their thread local Reduce list
1538   ///   with Remote Reduce list and store the result in the thread local
1539   ///   Reduce list.
1540   /// Value copy:
1541   ///   In this step, we deal with the assumption made by algorithm 2
1542   ///   (i.e. contiguity assumption).  When we have an odd number of lanes
1543   ///   active, say 2k+1, only k threads will be effective and therefore k
1544   ///   new values will be produced.  However, the Reduce list owned by the
1545   ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
1546   ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1547   ///   that the contiguity assumption still holds.
1548   ///
1549   /// \param ReductionInfos Array type containing the ReductionOps.
1550   /// \param ReduceFn The reduction function.
1551   /// \param FuncAttrs Optional param to specify any function attributes that
1552   ///                  need to be copied to the new function.
1553   ///
1554   /// \return The ShuffleAndReduce function.
1555   Function *emitShuffleAndReduceFunction(
1556       ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1557       Function *ReduceFn, AttributeList FuncAttrs);
1558 
1559   /// This function emits a helper that gathers Reduce lists from the first
1560   /// lane of every active warp to lanes in the first warp.
1561   ///
1562   /// void inter_warp_copy_func(void* reduce_data, num_warps)
1563   ///   shared smem[warp_size];
1564   ///   For all data entries D in reduce_data:
1565   ///     sync
1566   ///     If (I am the first lane in each warp)
1567   ///       Copy my local D to smem[warp_id]
1568   ///     sync
1569   ///     if (I am the first warp)
1570   ///       Copy smem[thread_id] to my local D
1571   ///
1572   /// \param Loc The insert and source location description.
1573   /// \param ReductionInfos Array type containing the ReductionOps.
1574   /// \param FuncAttrs Optional param to specify any function attributes that
1575   ///                  need to be copied to the new function.
1576   ///
1577   /// \return The InterWarpCopy function.
1578   Expected<Function *>
1579   emitInterWarpCopyFunction(const LocationDescription &Loc,
1580                             ArrayRef<ReductionInfo> ReductionInfos,
1581                             AttributeList FuncAttrs);
1582 
1583   /// This function emits a helper that copies all the reduction variables from
1584   /// the team into the provided global buffer for the reduction variables.
1585   ///
1586   /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1587   ///   For all data entries D in reduce_data:
1588   ///     Copy local D to buffer.D[Idx]
1589   ///
1590   /// \param ReductionInfos Array type containing the ReductionOps.
1591   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1592   /// \param FuncAttrs Optional param to specify any function attributes that
1593   ///                  need to be copied to the new function.
1594   ///
1595   /// \return The ListToGlobalCopy function.
1596   Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1597                                          Type *ReductionsBufferTy,
1598                                          AttributeList FuncAttrs);
1599 
1600   /// This function emits a helper that copies all the reduction variables from
1601   /// the provided global buffer back into the team's reduction variables.
1602   ///
1603   /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
1604   ///   For all data entries D in reduce_data:
1605   ///     Copy buffer.D[Idx] to local D;
1606   ///
1607   /// \param ReductionInfos Array type containing the ReductionOps.
1608   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1609   /// \param FuncAttrs Optional param to specify any function attributes that
1610   ///                  need to be copied to the new function.
1611   ///
1612   /// \return The GlobalToList function.
1613   Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1614                                          Type *ReductionsBufferTy,
1615                                          AttributeList FuncAttrs);
1616 
1617   /// This function emits a helper that reduces all the reduction variables from
1618   /// the team into the provided global buffer for the reduction variables.
1619   ///
1620   /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1621   ///  void *GlobPtrs[];
1622   ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
1623   ///  ...
1624   ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
1625   ///  reduce_function(GlobPtrs, reduce_data);
1626   ///
1627   /// \param ReductionInfos Array type containing the ReductionOps.
1628   /// \param ReduceFn The reduction function.
1629   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1630   /// \param FuncAttrs Optional param to specify any function attributes that
1631   ///                  need to be copied to the new function.
1632   ///
1633   /// \return The ListToGlobalReduce function.
1634   Function *
1635   emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1636                                  Function *ReduceFn, Type *ReductionsBufferTy,
1637                                  AttributeList FuncAttrs);
1638 
1639   /// This function emits a helper that reduces all the reduction variables from
1640   /// the provided global buffer into the team's reduction variables.
1641   ///
1642   /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1643   ///  void *GlobPtrs[];
1644   ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
1645   ///  ...
1646   ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
1647   ///  reduce_function(reduce_data, GlobPtrs);
1648   ///
1649   /// \param ReductionInfos Array type containing the ReductionOps.
1650   /// \param ReduceFn The reduction function.
1651   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1652   /// \param FuncAttrs Optional param to specify any function attributes that
1653   ///                  need to be copied to the new function.
1654   ///
1655   /// \return The GlobalToListReduce function.
1656   Function *
1657   emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1658                                  Function *ReduceFn, Type *ReductionsBufferTy,
1659                                  AttributeList FuncAttrs);
1660 
1661   /// Get the function name of a reduction function.
1662   std::string getReductionFuncName(StringRef Name) const;
1663 
1664   /// Emits reduction function.
1665   /// \param ReducerName Name of the function calling the reduction.
1666   /// \param ReductionInfos Array type containing the ReductionOps.
1667   /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1668   ///                           CodeGenCB kind.
1669   /// \param FuncAttrs Optional param to specify any function attributes that
1670   ///                  need to be copied to the new function.
1671   ///
1672   /// \return The reduction function.
1673   Expected<Function *> createReductionFunction(
1674       StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1675       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1676       AttributeList FuncAttrs = {});
1677 
1678 public:
1679   ///
1680   /// Design of OpenMP reductions on the GPU
1681   ///
1682   /// Consider a typical OpenMP program with one or more reduction
1683   /// clauses:
1684   ///
1685   /// float foo;
1686   /// double bar;
1687   /// #pragma omp target teams distribute parallel for \
1688   ///             reduction(+:foo) reduction(*:bar)
1689   /// for (int i = 0; i < N; i++) {
1690   ///   foo += A[i]; bar *= B[i];
1691   /// }
1692   ///
1693   /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1694   /// all teams.  In our OpenMP implementation on the NVPTX device an
1695   /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1696   /// within a team are mapped to CUDA threads within a threadblock.
1697   /// Our goal is to efficiently aggregate values across all OpenMP
1698   /// threads such that:
1699   ///
1700   ///   - the compiler and runtime are logically concise, and
1701   ///   - the reduction is performed efficiently in a hierarchical
1702   ///     manner as follows: within OpenMP threads in the same warp,
1703   ///     across warps in a threadblock, and finally across teams on
1704   ///     the NVPTX device.
1705   ///
1706   /// Introduction to Decoupling
1707   ///
1708   /// We would like to decouple the compiler and the runtime so that the
1709   /// latter is ignorant of the reduction variables (number, data types)
1710   /// and the reduction operators.  This allows a simpler interface
1711   /// and implementation while still attaining good performance.
1712   ///
1713   /// Pseudocode for the aforementioned OpenMP program generated by the
1714   /// compiler is as follows:
1715   ///
1716   /// 1. Create private copies of reduction variables on each OpenMP
1717   ///    thread: 'foo_private', 'bar_private'
1718   /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1719   ///    to it and writes the result in 'foo_private' and 'bar_private'
1720   ///    respectively.
1721   /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1722   ///    and store the result on the team master:
1723   ///
1724   ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1725   ///        reduceData, shuffleReduceFn, interWarpCpyFn)
1726   ///
1727   ///     where:
1728   ///       struct ReduceData {
1729   ///         double *foo;
1730   ///         double *bar;
1731   ///       } reduceData
1732   ///       reduceData.foo = &foo_private
1733   ///       reduceData.bar = &bar_private
1734   ///
1735   ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1736   ///     auxiliary functions generated by the compiler that operate on
1737   ///     variables of type 'ReduceData'.  They aid the runtime perform
1738   ///     algorithmic steps in a data agnostic manner.
1739   ///
1740   ///     'shuffleReduceFn' is a pointer to a function that reduces data
1741   ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
1742   ///     same warp.  It takes the following arguments as input:
1743   ///
1744   ///     a. variable of type 'ReduceData' on the calling lane,
1745   ///     b. its lane_id,
1746   ///     c. an offset relative to the current lane_id to generate a
1747   ///        remote_lane_id.  The remote lane contains the second
1748   ///        variable of type 'ReduceData' that is to be reduced.
1749   ///     d. an algorithm version parameter determining which reduction
1750   ///        algorithm to use.
1751   ///
1752   ///     'shuffleReduceFn' retrieves data from the remote lane using
1753   ///     efficient GPU shuffle intrinsics and reduces, using the
1754   ///     algorithm specified by the 4th parameter, the two operands
1755   ///     element-wise.  The result is written to the first operand.
1756   ///
1757   ///     Different reduction algorithms are implemented in different
1758   ///     runtime functions, all calling 'shuffleReduceFn' to perform
1759   ///     the essential reduction step.  Therefore, based on the 4th
1760   ///     parameter, this function behaves slightly differently to
1761   ///     cooperate with the runtime to ensure correctness under
1762   ///     different circumstances.
1763   ///
1764   ///     'InterWarpCpyFn' is a pointer to a function that transfers
1765   ///     reduced variables across warps.  It tunnels, through CUDA
1766   ///     shared memory, the thread-private data of type 'ReduceData'
1767   ///     from lane 0 of each warp to a lane in the first warp.
1768   /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1769   ///    The last team writes the global reduced value to memory.
1770   ///
1771   ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
1772   ///             reduceData, shuffleReduceFn, interWarpCpyFn,
1773   ///             scratchpadCopyFn, loadAndReduceFn)
1774   ///
1775   ///     'scratchpadCopyFn' is a helper that stores reduced
1776   ///     data from the team master to a scratchpad array in
1777   ///     global memory.
1778   ///
1779   ///     'loadAndReduceFn' is a helper that loads data from
1780   ///     the scratchpad array and reduces it with the input
1781   ///     operand.
1782   ///
1783   ///     These compiler generated functions hide address
1784   ///     calculation and alignment information from the runtime.
1785   /// 5. if ret == 1:
1786   ///     The team master of the last team stores the reduced
1787   ///     result to the globals in memory.
1788   ///     foo += reduceData.foo; bar *= reduceData.bar
1789   ///
1790   ///
1791   /// Warp Reduction Algorithms
1792   ///
1793   /// On the warp level, we have three algorithms implemented in the
1794   /// OpenMP runtime depending on the number of active lanes:
1795   ///
1796   /// Full Warp Reduction
1797   ///
1798   /// The reduce algorithm within a warp where all lanes are active
1799   /// is implemented in the runtime as follows:
1800   ///
1801   /// full_warp_reduce(void *reduce_data,
1802   ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1803   ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1804   ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
1805   /// }
1806   ///
1807   /// The algorithm completes in log(2, WARPSIZE) steps.
1808   ///
1809   /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1810   /// not used therefore we save instructions by not retrieving lane_id
1811   /// from the corresponding special registers.  The 4th parameter, which
1812   /// represents the version of the algorithm being used, is set to 0 to
1813   /// signify full warp reduction.
1814   ///
1815   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1816   ///
1817   /// #reduce_elem refers to an element in the local lane's data structure
1818   /// #remote_elem is retrieved from a remote lane
1819   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1820   /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1821   ///
1822   /// Contiguous Partial Warp Reduction
1823   ///
1824   /// This reduce algorithm is used within a warp where only the first
1825   /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
1826   /// number of OpenMP threads in a parallel region is not a multiple of
1827   /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
1828   ///
1829   /// void
1830   /// contiguous_partial_reduce(void *reduce_data,
1831   ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
1832   ///                           int size, int lane_id) {
1833   ///   int curr_size;
1834   ///   int offset;
1835   ///   curr_size = size;
  ///   offset = curr_size/2;
1837   ///   while (offset>0) {
1838   ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1839   ///     curr_size = (curr_size+1)/2;
1840   ///     offset = curr_size/2;
1841   ///   }
1842   /// }
1843   ///
1844   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1845   ///
1846   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1847   /// if (lane_id < offset)
1848   ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1849   /// else
1850   ///     reduce_elem = remote_elem
1851   ///
1852   /// This algorithm assumes that the data to be reduced are located in a
1853   /// contiguous subset of lanes starting from the first.  When there is
1854   /// an odd number of active lanes, the data in the last lane is not
  /// aggregated with any other lane's data but is instead copied over.
1856   ///
1857   /// Dispersed Partial Warp Reduction
1858   ///
1859   /// This algorithm is used within a warp when any discontiguous subset of
1860   /// lanes are active.  It is used to implement the reduction operation
1861   /// across lanes in an OpenMP simd region or in a nested parallel region.
1862   ///
1863   /// void
1864   /// dispersed_partial_reduce(void *reduce_data,
1865   ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1866   ///   int size, remote_id;
1867   ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
1868   ///   do {
1869   ///       remote_id = next_active_lane_id_right_after_me();
  ///       # the above function returns 0 if no active lane
1871   ///       # is present right after the current lane.
1872   ///       size = number_of_active_lanes_in_this_warp();
1873   ///       logical_lane_id /= 2;
1874   ///       ShuffleReduceFn(reduce_data, logical_lane_id,
1875   ///                       remote_id-1-threadIdx.x, 2);
1876   ///   } while (logical_lane_id % 2 == 0 && size > 1);
1877   /// }
1878   ///
1879   /// There is no assumption made about the initial state of the reduction.
1880   /// Any number of lanes (>=1) could be active at any position.  The reduction
1881   /// result is returned in the first active lane.
1882   ///
1883   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1884   ///
1885   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1886   /// if (lane_id % 2 == 0 && offset > 0)
1887   ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1888   /// else
1889   ///     reduce_elem = remote_elem
1890   ///
1891   ///
1892   /// Intra-Team Reduction
1893   ///
1894   /// This function, as implemented in the runtime call
1895   /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1896   /// threads in a team.  It first reduces within a warp using the
1897   /// aforementioned algorithms.  We then proceed to gather all such
1898   /// reduced values at the first warp.
1899   ///
1900   /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1901   /// data from each of the "warp master" (zeroth lane of each warp, where
1902   /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
1903   /// a mathematical sense) the problem of reduction across warp masters in
1904   /// a block to the problem of warp reduction.
1905   ///
1906   ///
1907   /// Inter-Team Reduction
1908   ///
1909   /// Once a team has reduced its data to a single value, it is stored in
1910   /// a global scratchpad array.  Since each team has a distinct slot, this
1911   /// can be done without locking.
1912   ///
1913   /// The last team to write to the scratchpad array proceeds to reduce the
1914   /// scratchpad array.  One or more workers in the last team use the helper
1915   /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1916   /// the k'th worker reduces every k'th element.
1917   ///
1918   /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1919   /// reduce across workers and compute a globally reduced value.
1920   ///
1921   /// \param Loc                The location where the reduction was
  ///                           encountered. Must be within the associated
1923   ///                           directive and after the last local access to the
1924   ///                           reduction variables.
  /// \param AllocaIP           An insertion point suitable for allocas usable
  ///                           in reductions.
  /// \param CodeGenIP          An insertion point suitable for code generation.
  /// \param ReductionInfos     A list of info on each reduction variable.
  /// \param IsNoWait           Optional flag set if the reduction is marked as
  ///                           nowait.
  /// \param IsTeamsReduction   Optional flag set if it is a teams reduction.
  /// \param ReductionGenCBKind Optional kind of the reduction generator
  ///                           callback (default: MLIR).
  /// \param GridValue          Optional GPU grid value.
  /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
  ///                           used for teams reduction.
  /// \param SrcLocInfo         Source location information global.
1938   LLVM_ABI InsertPointOrErrorTy createReductionsGPU(
1939       const LocationDescription &Loc, InsertPointTy AllocaIP,
1940       InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
1941       bool IsNoWait = false, bool IsTeamsReduction = false,
1942       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1943       std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
1944       Value *SrcLocInfo = nullptr);
1945 
1946   // TODO: provide atomic and non-atomic reduction generators for reduction
1947   // operators defined by the OpenMP specification.
1948 
1949   /// Generator for '#omp reduction'.
1950   ///
1951   /// Emits the IR instructing the runtime to perform the specific kind of
1952   /// reductions. Expects reduction variables to have been privatized and
1953   /// initialized to reduction-neutral values separately. Emits the calls to
1954   /// runtime functions as well as the reduction function and the basic blocks
1955   /// performing the reduction atomically and non-atomically.
1956   ///
1957   /// The code emitted for the following:
1958   ///
1959   /// \code
1960   ///   type var_1;
1961   ///   type var_2;
1962   ///   #pragma omp <directive> reduction(reduction-op:var_1,var_2)
1963   ///   /* body */;
1964   /// \endcode
1965   ///
1966   /// corresponds to the following sketch.
1967   ///
1968   /// \code
1969   /// void _outlined_par() {
1970   ///   // N is the number of different reductions.
1971   ///   void *red_array[] = {privatized_var_1, privatized_var_2, ...};
1972   ///   switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
1973   ///                        _omp_reduction_func,
1974   ///                        _gomp_critical_user.reduction.var)) {
1975   ///   case 1: {
1976   ///     var_1 = var_1 <reduction-op> privatized_var_1;
1977   ///     var_2 = var_2 <reduction-op> privatized_var_2;
1978   ///     // ...
1979   ///    __kmpc_end_reduce(...);
1980   ///     break;
1981   ///   }
1982   ///   case 2: {
1983   ///     _Atomic<ReductionOp>(var_1, privatized_var_1);
1984   ///     _Atomic<ReductionOp>(var_2, privatized_var_2);
1985   ///     // ...
1986   ///     break;
1987   ///   }
1988   ///   default: break;
1989   ///   }
1990   /// }
1991   ///
1992   /// void _omp_reduction_func(void **lhs, void **rhs) {
1993   ///   *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
1994   ///   *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
1995   ///   // ...
1996   /// }
1997   /// \endcode
1998   ///
1999   /// \param Loc                The location where the reduction was
  ///                           encountered. Must be within the associated
2001   ///                           directive and after the last local access to the
2002   ///                           reduction variables.
2003   /// \param AllocaIP           An insertion point suitable for allocas usable
2004   ///                           in reductions.
2005   /// \param ReductionInfos     A list of info on each reduction variable.
2006   /// \param IsNoWait           A flag set if the reduction is marked as nowait.
2007   /// \param IsByRef            A flag set if the reduction is using reference
2008   /// or direct value.
2009   /// \param IsTeamsReduction   Optional flag set if it is a teams
2010   ///                           reduction.
2011   LLVM_ABI InsertPointOrErrorTy createReductions(
2012       const LocationDescription &Loc, InsertPointTy AllocaIP,
2013       ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
2014       bool IsNoWait = false, bool IsTeamsReduction = false);
2015 
2016   ///}
2017 
2018   /// Return the insertion point used by the underlying IRBuilder.
getInsertionPoint()2019   InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
2020 
2021   /// Update the internal location to \p Loc.
updateToLocation(const LocationDescription & Loc)2022   bool updateToLocation(const LocationDescription &Loc) {
2023     Builder.restoreIP(Loc.IP);
2024     Builder.SetCurrentDebugLocation(Loc.DL);
2025     return Loc.IP.getBlock() != nullptr;
2026   }
2027 
2028   /// Return the function declaration for the runtime function with \p FnID.
2029   LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M,
2030                                                      omp::RuntimeFunction FnID);
2031 
2032   LLVM_ABI Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);
2033 
2034   /// Return the (LLVM-IR) string describing the source location \p LocStr.
2035   LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef LocStr,
2036                                           uint32_t &SrcLocStrSize);
2037 
2038   /// Return the (LLVM-IR) string describing the default source location.
2039   LLVM_ABI Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);
2040 
2041   /// Return the (LLVM-IR) string describing the source location identified by
2042   /// the arguments.
2043   LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef FunctionName,
2044                                           StringRef FileName, unsigned Line,
2045                                           unsigned Column,
2046                                           uint32_t &SrcLocStrSize);
2047 
2048   /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
2049   /// fallback if \p DL does not specify the function name.
2050   LLVM_ABI Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
2051                                           Function *F = nullptr);
2052 
2053   /// Return the (LLVM-IR) string describing the source location \p Loc.
2054   LLVM_ABI Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2055                                           uint32_t &SrcLocStrSize);
2056 
2057   /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2058   /// TODO: Create a enum class for the Reserve2Flags
2059   LLVM_ABI Constant *getOrCreateIdent(Constant *SrcLocStr,
2060                                       uint32_t SrcLocStrSize,
2061                                       omp::IdentFlag Flags = omp::IdentFlag(0),
2062                                       unsigned Reserve2Flags = 0);
2063 
2064   /// Create a hidden global flag \p Name in the module with initial value \p
2065   /// Value.
2066   LLVM_ABI GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
2067 
2068   /// Emit the llvm.used metadata.
2069   LLVM_ABI void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List);
2070 
2071   /// Emit the kernel execution mode.
2072   LLVM_ABI GlobalVariable *
2073   emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode);
2074 
2075   /// Generate control flow and cleanup for cancellation.
2076   ///
2077   /// \param CancelFlag Flag indicating if the cancellation is performed.
  /// \param CanceledDirective The kind of directive that is canceled.
2079   /// \param ExitCB Extra code to be generated in the exit block.
2080   ///
2081   /// \return an error, if any were triggered during execution.
2082   LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag,
2083                                           omp::Directive CanceledDirective,
2084                                           FinalizeCallbackTy ExitCB = {});
2085 
2086   /// Generate a target region entry call.
2087   ///
2088   /// \param Loc The location at which the request originated and is fulfilled.
2089   /// \param AllocaIP The insertion point to be used for alloca instructions.
2090   /// \param Return Return value of the created function returned by reference.
2091   /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param NumTeams Number of teams for the region via the 'num_teams' clause
2093   ///                 or 0 if unspecified and -1 if there is no 'teams' clause.
2094   /// \param NumThreads Number of threads via the 'thread_limit' clause.
2095   /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2096   /// \param KernelArgs Array of arguments to the kernel.
2097   LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2098                                           InsertPointTy AllocaIP,
2099                                           Value *&Return, Value *Ident,
2100                                           Value *DeviceID, Value *NumTeams,
2101                                           Value *NumThreads, Value *HostPtr,
2102                                           ArrayRef<Value *> KernelArgs);
2103 
2104   /// Generate a flush runtime call.
2105   ///
2106   /// \param Loc The location at which the request originated and is fulfilled.
2107   LLVM_ABI void emitFlush(const LocationDescription &Loc);
2108 
2109   /// The finalization stack made up of finalize callbacks currently in-flight,
2110   /// wrapped into FinalizationInfo objects that reference also the finalization
2111   /// target block and the kind of cancellable directive.
2112   SmallVector<FinalizationInfo, 8> FinalizationStack;
2113 
2114   /// Return true if the last entry in the finalization stack is of kind \p DK
2115   /// and cancellable.
isLastFinalizationInfoCancellable(omp::Directive DK)2116   bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2117     return !FinalizationStack.empty() &&
2118            FinalizationStack.back().IsCancellable &&
2119            FinalizationStack.back().DK == DK;
2120   }
2121 
2122   /// Generate a taskwait runtime call.
2123   ///
2124   /// \param Loc The location at which the request originated and is fulfilled.
2125   LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc);
2126 
2127   /// Generate a taskyield runtime call.
2128   ///
2129   /// \param Loc The location at which the request originated and is fulfilled.
2130   LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc);
2131 
2132   /// Return the current thread ID.
2133   ///
2134   /// \param Ident The ident (ident_t*) describing the query origin.
2135   LLVM_ABI Value *getOrCreateThreadID(Value *Ident);
2136 
2137   /// The OpenMPIRBuilder Configuration
2138   OpenMPIRBuilderConfig Config;
2139 
2140   /// The underlying LLVM-IR module
2141   Module &M;
2142 
2143   /// The LLVM-IR Builder used to create IR.
2144   IRBuilder<> Builder;
2145 
2146   /// Map to remember source location strings
2147   StringMap<Constant *> SrcLocStrMap;
2148 
2149   /// Map to remember existing ident_t*.
2150   DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;
2151 
2152   /// Info manager to keep track of target regions.
2153   OffloadEntriesInfoManager OffloadInfoManager;
2154 
2155   /// The target triple of the underlying module.
2156   const Triple T;
2157 
  /// Helper that contains information about regions we need to outline
  /// during finalization.
  struct OutlineInfo {
    using PostOutlineCBTy = std::function<void(Function &)>;
    /// Callback run on the outlined function; may be empty.
    /// NOTE(review): invocation timing (presumably during finalize()) is not
    /// visible here -- confirm at the call site.
    PostOutlineCBTy PostOutlineCB;
    /// First and last blocks of the region to be outlined, and the block that
    /// holds the allocas of the enclosing (outer) region.
    BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
    /// Values that should not be packed into the outlined function's
    /// argument aggregate -- TODO confirm against the outliner.
    SmallVector<Value *, 2> ExcludeArgsFromAggregate;

    /// Collect all blocks in between EntryBB and ExitBB in both the given
    /// vector and set.
    LLVM_ABI void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
                                SmallVectorImpl<BasicBlock *> &BlockVector);

    /// Return the function that contains the region to be outlined.
    Function *getFunction() const { return EntryBB->getParent(); }
  };
2174 
2175   /// Collection of regions that need to be outlined during finalization.
2176   SmallVector<OutlineInfo, 16> OutlineInfos;
2177 
2178   /// A collection of candidate target functions that's constant allocas will
2179   /// attempt to be raised on a call of finalize after all currently enqueued
2180   /// outline info's have been processed.
2181   SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;
2182 
2183   /// Collection of owned canonical loop objects that eventually need to be
2184   /// free'd.
2185   std::forward_list<CanonicalLoopInfo> LoopInfos;
2186 
2187   /// Add a new region that will be outlined later.
addOutlineInfo(OutlineInfo && OI)2188   void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2189 
2190   /// An ordered map of auto-generated variables to their unique names.
2191   /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2192   /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2193   /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
2194   /// variables.
2195   StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;
2196 
2197   /// Computes the size of type in bytes.
2198   LLVM_ABI Value *getSizeInBytes(Value *BasePtr);
2199 
2200   // Emit a branch from the current block to the Target block only if
2201   // the current block has a terminator.
2202   LLVM_ABI void emitBranch(BasicBlock *Target);
2203 
2204   // If BB has no use then delete it and return. Else place BB after the current
2205   // block, if possible, or else at the end of the function. Also add a branch
2206   // from current block to BB if current block does not have a terminator.
2207   LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn,
2208                           bool IsFinished = false);
2209 
2210   /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2211   /// Here is the logic:
2212   /// if (Cond) {
2213   ///   ThenGen();
2214   /// } else {
2215   ///   ElseGen();
2216   /// }
2217   ///
2218   /// \return an error, if any were triggered during execution.
2219   LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
2220                               BodyGenCallbackTy ElseGen,
2221                               InsertPointTy AllocaIP = {});
2222 
2223   /// Create the global variable holding the offload mappings information.
2224   LLVM_ABI GlobalVariable *
2225   createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
2226                         std::string VarName);
2227 
2228   /// Create the global variable holding the offload names information.
2229   LLVM_ABI GlobalVariable *
2230   createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
2231                         std::string VarName);
2232 
  /// Allocas shared by calls to the offloading mapper functions (see
  /// createMapperAllocas/emitMapperCall): one array each for the argument
  /// base pointers, the argument pointers, and the argument sizes.
  struct MapperAllocas {
    AllocaInst *ArgsBase = nullptr;
    AllocaInst *Args = nullptr;
    AllocaInst *ArgSizes = nullptr;
  };
2238 
2239   /// Create the allocas instruction used in call to mapper functions.
2240   LLVM_ABI void createMapperAllocas(const LocationDescription &Loc,
2241                                     InsertPointTy AllocaIP,
2242                                     unsigned NumOperands,
2243                                     struct MapperAllocas &MapperAllocas);
2244 
2245   /// Create the call for the target mapper function.
2246   /// \param Loc The source location description.
2247   /// \param MapperFunc Function to be called.
2248   /// \param SrcLocInfo Source location information global.
2249   /// \param MaptypesArg The argument types.
2250   /// \param MapnamesArg The argument names.
2251   /// \param MapperAllocas The AllocaInst used for the call.
2252   /// \param DeviceID Device ID for the call.
2253   /// \param NumOperands Number of operands in the call.
2254   LLVM_ABI void emitMapperCall(const LocationDescription &Loc,
2255                                Function *MapperFunc, Value *SrcLocInfo,
2256                                Value *MaptypesArg, Value *MapnamesArg,
2257                                struct MapperAllocas &MapperAllocas,
2258                                int64_t DeviceID, unsigned NumOperands);
2259 
  /// Container for the arguments used to pass data to the runtime library.
  struct TargetDataRTArgs {
    /// The array of base pointers passed to the runtime library.
    Value *BasePointersArray = nullptr;
    /// The array of section pointers passed to the runtime library.
    Value *PointersArray = nullptr;
    /// The array of sizes passed to the runtime library.
    Value *SizesArray = nullptr;
    /// The array of map types passed to the runtime library for the beginning
    /// of the region or for the entire region if there are no separate map
    /// types for the region end.
    Value *MapTypesArray = nullptr;
    /// The array of map types passed to the runtime library for the end of the
    /// region, or nullptr if there are no separate map types for the region
    /// end.
    Value *MapTypesArrayEnd = nullptr;
    /// The array of user-defined mappers passed to the runtime library.
    Value *MappersArray = nullptr;
    /// The array of original declaration names of mapped pointers sent to the
    /// runtime library for debugging.
    Value *MapNamesArray = nullptr;

    /// Construct with all arrays unset (null).
    explicit TargetDataRTArgs() {}
    /// Construct from already-emitted runtime-argument arrays.
    explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                              Value *SizesArray, Value *MapTypesArray,
                              Value *MapTypesArrayEnd, Value *MappersArray,
                              Value *MapNamesArray)
        : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
          SizesArray(SizesArray), MapTypesArray(MapTypesArray),
          MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
          MapNamesArray(MapNamesArray) {}
  };
2292 
  /// Container to pass the default attributes with which a kernel must be
  /// launched, used to set kernel attributes and populate associated static
  /// structures.
  ///
  /// For max values, < 0 means unset, == 0 means set but unknown at compile
  /// time. The number of max values will be 1 except for the case where
  /// ompx_bare is set.
  struct TargetKernelDefaultAttrs {
    /// Execution mode of the kernel; defaults to generic.
    omp::OMPTgtExecModeFlags ExecFlags =
        omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
    /// Maximum number of teams (per dimension when ompx_bare is set).
    SmallVector<int32_t, 3> MaxTeams = {-1};
    int32_t MinTeams = 1;
    /// Maximum number of threads (per dimension when ompx_bare is set).
    SmallVector<int32_t, 3> MaxThreads = {-1};
    int32_t MinThreads = 1;
    int32_t ReductionDataSize = 0;
    int32_t ReductionBufferLength = 0;
  };
2310 
  /// Container to pass LLVM IR runtime values or constants related to the
  /// number of teams and threads with which the kernel must be launched, as
  /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
  /// kernel. These must be defined in the host prior to the call to the kernel
  /// launch OpenMP RTL function.
  struct TargetKernelRuntimeAttrs {
    SmallVector<Value *, 3> MaxTeams = {nullptr};
    Value *MinTeams = nullptr;
    SmallVector<Value *, 3> TargetThreadLimit = {nullptr};
    SmallVector<Value *, 3> TeamsThreadLimit = {nullptr};

    /// 'parallel' construct 'num_threads' clause value, if present and it is an
    /// SPMD kernel.
    Value *MaxThreads = nullptr;

    /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
    /// it is a generic kernel.
    Value *LoopTripCount = nullptr;
  };
2330 
  /// Data structure that contains the needed information to construct the
  /// kernel args vector.
  struct TargetKernelArgs {
    /// Number of arguments passed to the runtime library.
    unsigned NumTargetItems = 0;
    /// Arguments passed to the runtime library.
    TargetDataRTArgs RTArgs;
    /// The number of iterations.
    Value *NumIterations = nullptr;
    /// The number of teams.
    ArrayRef<Value *> NumTeams;
    /// The number of threads.
    ArrayRef<Value *> NumThreads;
    /// The size of the dynamic shared memory.
    Value *DynCGGroupMem = nullptr;
    /// True if the kernel has 'no wait' clause.
    bool HasNoWait = false;

    /// Construct with everything unset/zero.
    TargetKernelArgs() {}
    /// Construct from fully-determined kernel launch information.
    TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                     Value *NumIterations, ArrayRef<Value *> NumTeams,
                     ArrayRef<Value *> NumThreads, Value *DynCGGroupMem,
                     bool HasNoWait)
        : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
          NumIterations(NumIterations), NumTeams(NumTeams),
          NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
          HasNoWait(HasNoWait) {}
  };
2360 
2361   /// Create the kernel args vector used by emitTargetKernel. This function
2362   /// creates various constant values that are used in the resulting args
2363   /// vector.
2364   LLVM_ABI static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2365                                            IRBuilderBase &Builder,
2366                                            SmallVector<Value *> &ArgsVector);
2367 
2368   /// Struct that keeps the information that should be kept throughout
2369   /// a 'target data' region.
2370   class TargetDataInfo {
2371     /// Set to true if device pointer information have to be obtained.
2372     bool RequiresDevicePointerInfo = false;
2373     /// Set to true if Clang emits separate runtime calls for the beginning and
2374     /// end of the region.  These calls might have separate map type arrays.
2375     bool SeparateBeginEndCalls = false;
2376 
2377   public:
2378     TargetDataRTArgs RTArgs;
2379 
2380     SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
2381         DevicePtrInfoMap;
2382 
2383     /// Indicate whether any user-defined mapper exists.
2384     bool HasMapper = false;
2385     /// The total number of pointers passed to the runtime library.
2386     unsigned NumberOfPtrs = 0u;
2387 
2388     bool EmitDebug = false;
2389 
2390     /// Whether the `target ... data` directive has a `nowait` clause.
2391     bool HasNoWait = false;
2392 
TargetDataInfo()2393     explicit TargetDataInfo() {}
TargetDataInfo(bool RequiresDevicePointerInfo,bool SeparateBeginEndCalls)2394     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2395                             bool SeparateBeginEndCalls)
2396         : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2397           SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2398     /// Clear information about the data arrays.
clearArrayInfo()2399     void clearArrayInfo() {
2400       RTArgs = TargetDataRTArgs();
2401       HasMapper = false;
2402       NumberOfPtrs = 0u;
2403     }
2404     /// Return true if the current target data information has valid arrays.
isValid()2405     bool isValid() {
2406       return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2407              RTArgs.SizesArray && RTArgs.MapTypesArray &&
2408              (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2409     }
requiresDevicePointerInfo()2410     bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
separateBeginEndCalls()2411     bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2412   };
2413 
2414   enum class DeviceInfoTy { None, Pointer, Address };
2415   using MapValuesArrayTy = SmallVector<Value *, 4>;
2416   using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
2417   using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
2418   using MapNamesArrayTy = SmallVector<Constant *, 4>;
2419   using MapDimArrayTy = SmallVector<uint64_t, 4>;
2420   using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
2421 
2422   /// This structure contains combined information generated for mappable
2423   /// clauses, including base pointers, pointers, sizes, map types, user-defined
2424   /// mappers, and non-contiguous information.
2425   struct MapInfosTy {
2426     struct StructNonContiguousInfo {
2427       bool IsNonContiguous = false;
2428       MapDimArrayTy Dims;
2429       MapNonContiguousArrayTy Offsets;
2430       MapNonContiguousArrayTy Counts;
2431       MapNonContiguousArrayTy Strides;
2432     };
2433     MapValuesArrayTy BasePointers;
2434     MapValuesArrayTy Pointers;
2435     MapDeviceInfoArrayTy DevicePointers;
2436     MapValuesArrayTy Sizes;
2437     MapFlagsArrayTy Types;
2438     MapNamesArrayTy Names;
2439     StructNonContiguousInfo NonContigInfo;
2440 
2441     /// Append arrays in \a CurInfo.
appendMapInfosTy2442     void append(MapInfosTy &CurInfo) {
2443       BasePointers.append(CurInfo.BasePointers.begin(),
2444                           CurInfo.BasePointers.end());
2445       Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2446       DevicePointers.append(CurInfo.DevicePointers.begin(),
2447                             CurInfo.DevicePointers.end());
2448       Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2449       Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2450       Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2451       NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
2452                                 CurInfo.NonContigInfo.Dims.end());
2453       NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
2454                                    CurInfo.NonContigInfo.Offsets.end());
2455       NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
2456                                   CurInfo.NonContigInfo.Counts.end());
2457       NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
2458                                    CurInfo.NonContigInfo.Strides.end());
2459     }
2460   };
2461   using MapInfosOrErrorTy = Expected<MapInfosTy &>;
2462 
2463   /// Callback function type for functions emitting the host fallback code that
2464   /// is executed when the kernel launch fails. It takes an insertion point as
2465   /// parameter where the code should be emitted. It returns an insertion point
2466   /// that points right after the emitted code.
2467   using EmitFallbackCallbackTy =
2468       function_ref<InsertPointOrErrorTy(InsertPointTy)>;
2469 
2470   // Callback function type for emitting and fetching user defined custom
2471   // mappers.
2472   using CustomMapperCallbackTy =
2473       function_ref<Expected<Function *>(unsigned int)>;
2474 
2475   /// Generate a target region entry call and host fallback call.
2476   ///
2477   /// \param Loc The location at which the request originated and is fulfilled.
2478   /// \param OutlinedFnID The outlined function ID.
2479   /// \param EmitTargetCallFallbackCB Call back function to generate host
2480   ///        fallback code.
2481   /// \param Args Data structure holding information about the kernel arguments.
2482   /// \param DeviceID Identifier for the device via the 'device' clause.
2483   /// \param RTLoc Source location identifier
2484   /// \param AllocaIP The insertion point to be used for alloca instructions.
2485   LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(
2486       const LocationDescription &Loc, Value *OutlinedFnID,
2487       EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2488       Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
2489 
2490   /// Callback type for generating the bodies of device directives that require
2491   /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2492   ///
2493   /// \param DeviceID The ID of the device on which the target region will
2494   ///        execute.
2495   /// \param RTLoc Source location identifier
2496   /// \param TargetTaskAllocaIP Insertion point for the alloca block of the
2497   ///        generated task.
2498   ///
2499   /// \return an error, if any were triggered during execution.
2500   using TargetTaskBodyCallbackTy =
2501       function_ref<Error(Value *DeviceID, Value *RTLoc,
2502                          IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2503 
2504   /// Generate a target-task for the target construct
2505   ///
2506   /// \param TaskBodyCB Callback to generate the actual body of the target task.
2507   /// \param DeviceID Identifier for the device via the 'device' clause.
2508   /// \param RTLoc Source location identifier
2509   /// \param AllocaIP The insertion point to be used for alloca instructions.
2510   /// \param Dependencies Vector of DependData objects holding information of
2511   ///        dependencies as specified by the 'depend' clause.
2512   /// \param HasNoWait True if the target construct had 'nowait' on it, false
2513   ///        otherwise
2514   LLVM_ABI InsertPointOrErrorTy emitTargetTask(
2515       TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2516       OpenMPIRBuilder::InsertPointTy AllocaIP,
2517       const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
2518       const TargetDataRTArgs &RTArgs, bool HasNoWait);
2519 
2520   /// Emit the arguments to be passed to the runtime library based on the
2521   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
2522   /// ForEndCall, emit map types to be passed for the end of the region instead
2523   /// of the beginning.
2524   LLVM_ABI void emitOffloadingArraysArgument(
2525       IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
2526       OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall = false);
2527 
2528   /// Emit an array of struct descriptors to be assigned to the offload args.
2529   LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
2530                                             InsertPointTy CodeGenIP,
2531                                             MapInfosTy &CombinedInfo,
2532                                             TargetDataInfo &Info);
2533 
2534   /// Emit the arrays used to pass the captures and map information to the
2535   /// offloading runtime library. If there is no map or capture information,
2536   /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2537   /// that contains information generated for mappable clauses,
2538   /// including base pointers, pointers, sizes, map types, user-defined mappers.
2539   LLVM_ABI Error emitOffloadingArrays(
2540       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2541       TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
2542       bool IsNonContiguous = false,
2543       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2544 
2545   /// Allocates memory for and populates the arrays required for offloading
2546   /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2547   /// emits their base addresses as arguments to be passed to the runtime
2548   /// library. In essence, this function is a combination of
2549   /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2550   /// be preferred by clients of OpenMPIRBuilder.
2551   LLVM_ABI Error emitOffloadingArraysAndArgs(
2552       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2553       TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2554       CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false,
2555       bool ForEndCall = false,
2556       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2557 
2558   /// Creates offloading entry for the provided entry ID \a ID, address \a
2559   /// Addr, size \a Size, and flags \a Flags.
2560   LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
2561                                    int32_t Flags, GlobalValue::LinkageTypes,
2562                                    StringRef Name = "");
2563 
2564   /// The kind of errors that can occur when emitting the offload entries and
2565   /// metadata.
2566   enum EmitMetadataErrorKind {
2567     EMIT_MD_TARGET_REGION_ERROR,
2568     EMIT_MD_DECLARE_TARGET_ERROR,
2569     EMIT_MD_GLOBAL_VAR_LINK_ERROR
2570   };
2571 
2572   /// Callback function type
2573   using EmitMetadataErrorReportFunctionTy =
2574       std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2575 
2576   // Emit the offloading entries and metadata so that the device codegen side
2577   // can easily figure out what to emit. The produced metadata looks like
2578   // this:
2579   //
2580   // !omp_offload.info = !{!1, ...}
2581   //
2582   // We only generate metadata for functions that contain target regions.
2583   LLVM_ABI void createOffloadEntriesAndInfoMetadata(
2584       EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2585 
2586 public:
2587   /// Generator for __kmpc_copyprivate
2588   ///
2589   /// \param Loc The source location description.
2590   /// \param BufSize Number of elements in the buffer.
2591   /// \param CpyBuf List of pointers to data to be copied.
2592   /// \param CpyFn function to call for copying data.
2593   /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2594   ///
2595   /// \return The insertion position *after* the CopyPrivate call.
2596 
2597   LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc,
2598                                            llvm::Value *BufSize,
2599                                            llvm::Value *CpyBuf,
2600                                            llvm::Value *CpyFn,
2601                                            llvm::Value *DidIt);
2602 
2603   /// Generator for '#omp single'
2604   ///
2605   /// \param Loc The source location description.
2606   /// \param BodyGenCB Callback that will generate the region code.
2607   /// \param FiniCB Callback to finalize variable copies.
2608   /// \param IsNowait If false, a barrier is emitted.
2609   /// \param CPVars copyprivate variables.
2610   /// \param CPFuncs copy functions to use for each copyprivate variable.
2611   ///
2612   /// \returns The insertion position *after* the single call.
2613   LLVM_ABI InsertPointOrErrorTy
2614   createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2615                FinalizeCallbackTy FiniCB, bool IsNowait,
2616                ArrayRef<llvm::Value *> CPVars = {},
2617                ArrayRef<llvm::Function *> CPFuncs = {});
2618 
2619   /// Generator for '#omp master'
2620   ///
2621   /// \param Loc The insert and source location description.
2622   /// \param BodyGenCB Callback that will generate the region code.
2623   /// \param FiniCB Callback to finalize variable copies.
2624   ///
2625   /// \returns The insertion position *after* the master.
2626   LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2627                                              BodyGenCallbackTy BodyGenCB,
2628                                              FinalizeCallbackTy FiniCB);
2629 
2630   /// Generator for '#omp masked'
2631   ///
2632   /// \param Loc The insert and source location description.
2633   /// \param BodyGenCB Callback that will generate the region code.
2634   /// \param FiniCB Callback to finalize variable copies.
2635   ///
2636   /// \returns The insertion position *after* the masked.
2637   LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2638                                              BodyGenCallbackTy BodyGenCB,
2639                                              FinalizeCallbackTy FiniCB,
2640                                              Value *Filter);
2641 
2642   /// Generator for '#omp critical'
2643   ///
2644   /// \param Loc The insert and source location description.
2645   /// \param BodyGenCB Callback that will generate the region body code.
2646   /// \param FiniCB Callback to finalize variable copies.
2647   /// \param CriticalName name of the lock used by the critical directive
2648   /// \param HintInst Hint Instruction for hint clause associated with critical
2649   ///
2650   /// \returns The insertion position *after* the critical.
2651   LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2652                                                BodyGenCallbackTy BodyGenCB,
2653                                                FinalizeCallbackTy FiniCB,
2654                                                StringRef CriticalName,
2655                                                Value *HintInst);
2656 
2657   /// Generator for '#omp ordered depend (source | sink)'
2658   ///
2659   /// \param Loc The insert and source location description.
2660   /// \param AllocaIP The insertion point to be used for alloca instructions.
2661   /// \param NumLoops The number of loops in depend clause.
2662   /// \param StoreValues The value will be stored in vector address.
2663   /// \param Name The name of alloca instruction.
2664   /// \param IsDependSource If true, depend source; otherwise, depend sink.
2665   ///
2666   /// \return The insertion position *after* the ordered.
2667   LLVM_ABI InsertPointTy
2668   createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP,
2669                       unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues,
2670                       const Twine &Name, bool IsDependSource);
2671 
2672   /// Generator for '#omp ordered [threads | simd]'
2673   ///
2674   /// \param Loc The insert and source location description.
2675   /// \param BodyGenCB Callback that will generate the region code.
2676   /// \param FiniCB Callback to finalize variable copies.
2677   /// \param IsThreads If true, with threads clause or without clause;
2678   /// otherwise, with simd clause;
2679   ///
2680   /// \returns The insertion position *after* the ordered.
2681   LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(
2682       const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2683       FinalizeCallbackTy FiniCB, bool IsThreads);
2684 
2685   /// Generator for '#omp sections'
2686   ///
2687   /// \param Loc The insert and source location description.
2688   /// \param AllocaIP The insertion points to be used for alloca instructions.
2689   /// \param SectionCBs Callbacks that will generate body of each section.
2690   /// \param PrivCB Callback to copy a given variable (think copy constructor).
2691   /// \param FiniCB Callback to finalize variable copies.
2692   /// \param IsCancellable Flag to indicate a cancellable parallel region.
2693   /// \param IsNowait If true, barrier - to ensure all sections are executed
2694   /// before moving forward will not be generated.
2695   /// \returns The insertion position *after* the sections.
2696   LLVM_ABI InsertPointOrErrorTy
2697   createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
2698                  ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2699                  PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB,
2700                  bool IsCancellable, bool IsNowait);
2701 
2702   /// Generator for '#omp section'
2703   ///
2704   /// \param Loc The insert and source location description.
2705   /// \param BodyGenCB Callback that will generate the region body code.
2706   /// \param FiniCB Callback to finalize variable copies.
2707   /// \returns The insertion position *after* the section.
2708   LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc,
2709                                               BodyGenCallbackTy BodyGenCB,
2710                                               FinalizeCallbackTy FiniCB);
2711 
2712   /// Generator for `#omp teams`
2713   ///
2714   /// \param Loc The location where the teams construct was encountered.
2715   /// \param BodyGenCB Callback that will generate the region code.
2716   /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
2717   ///        it is as if lower bound is specified as equal to upperbound. If
2718   ///        this is non-null, then upperbound must also be non-null.
2719   /// \param NumTeamsUpper Upper bound on the number of teams.
2720   /// \param ThreadLimit on the number of threads that may participate in a
2721   ///        contention group created by each team.
2722   /// \param IfExpr is the integer argument value of the if condition on the
2723   ///        teams clause.
2724   LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc,
2725                                             BodyGenCallbackTy BodyGenCB,
2726                                             Value *NumTeamsLower = nullptr,
2727                                             Value *NumTeamsUpper = nullptr,
2728                                             Value *ThreadLimit = nullptr,
2729                                             Value *IfExpr = nullptr);
2730 
2731   /// Generator for `#omp distribute`
2732   ///
2733   /// \param Loc The location where the distribute construct was encountered.
2734   /// \param AllocaIP The insertion points to be used for alloca instructions.
2735   /// \param BodyGenCB Callback that will generate the region code.
2736   LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc,
2737                                                  InsertPointTy AllocaIP,
2738                                                  BodyGenCallbackTy BodyGenCB);
2739 
2740   /// Generate conditional branch and relevant BasicBlocks through which private
2741   /// threads copy the 'copyin' variables from Master copy to threadprivate
2742   /// copies.
2743   ///
2744   /// \param IP insertion block for copyin conditional
2745   /// \param MasterVarPtr a pointer to the master variable
2746   /// \param PrivateVarPtr a pointer to the threadprivate variable
2747   /// \param IntPtrTy Pointer size type
2748   /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2749   ///        and copy.in.end block
2750   ///
2751   /// \returns The insertion point where copying operation to be emitted.
2752   LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP,
2753                                                   Value *MasterAddr,
2754                                                   Value *PrivateAddr,
2755                                                   llvm::IntegerType *IntPtrTy,
2756                                                   bool BranchtoEnd = true);
2757 
2758   /// Create a runtime call for kmpc_Alloc
2759   ///
2760   /// \param Loc The insert and source location description.
2761   /// \param Size Size of allocated memory space
2762   /// \param Allocator Allocator information instruction
2763   /// \param Name Name of call Instruction for OMP_alloc
2764   ///
2765   /// \returns CallInst to the OMP_Alloc call
2766   LLVM_ABI CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2767                                     Value *Allocator, std::string Name = "");
2768 
2769   /// Create a runtime call for kmpc_free
2770   ///
2771   /// \param Loc The insert and source location description.
2772   /// \param Addr Address of memory space to be freed
2773   /// \param Allocator Allocator information instruction
2774   /// \param Name Name of call Instruction for OMP_Free
2775   ///
2776   /// \returns CallInst to the OMP_Free call
2777   LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2778                                    Value *Allocator, std::string Name = "");
2779 
2780   /// Create a runtime call for kmpc_threadprivate_cached
2781   ///
2782   /// \param Loc The insert and source location description.
2783   /// \param Pointer pointer to data to be cached
2784   /// \param Size size of data to be cached
2785   /// \param Name Name of call Instruction for callinst
2786   ///
2787   /// \returns CallInst to the thread private cache call.
2788   LLVM_ABI CallInst *
2789   createCachedThreadPrivate(const LocationDescription &Loc,
2790                             llvm::Value *Pointer, llvm::ConstantInt *Size,
2791                             const llvm::Twine &Name = Twine(""));
2792 
2793   /// Create a runtime call for __tgt_interop_init
2794   ///
2795   /// \param Loc The insert and source location description.
2796   /// \param InteropVar variable to be allocated
2797   /// \param InteropType type of interop operation
2798   /// \param Device device to which offloading will occur
2799   /// \param NumDependences  number of dependence variables
2800   /// \param DependenceAddress pointer to dependence variables
2801   /// \param HaveNowaitClause does nowait clause exist
2802   ///
2803   /// \returns CallInst to the __tgt_interop_init call
2804   LLVM_ABI CallInst *createOMPInteropInit(const LocationDescription &Loc,
2805                                           Value *InteropVar,
2806                                           omp::OMPInteropType InteropType,
2807                                           Value *Device, Value *NumDependences,
2808                                           Value *DependenceAddress,
2809                                           bool HaveNowaitClause);
2810 
2811   /// Create a runtime call for __tgt_interop_destroy
2812   ///
2813   /// \param Loc The insert and source location description.
2814   /// \param InteropVar variable to be allocated
2815   /// \param Device device to which offloading will occur
2816   /// \param NumDependences  number of dependence variables
2817   /// \param DependenceAddress pointer to dependence variables
2818   /// \param HaveNowaitClause does nowait clause exist
2819   ///
2820   /// \returns CallInst to the __tgt_interop_destroy call
2821   LLVM_ABI CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2822                                              Value *InteropVar, Value *Device,
2823                                              Value *NumDependences,
2824                                              Value *DependenceAddress,
2825                                              bool HaveNowaitClause);
2826 
2827   /// Create a runtime call for __tgt_interop_use
2828   ///
2829   /// \param Loc The insert and source location description.
2830   /// \param InteropVar variable to be allocated
2831   /// \param Device device to which offloading will occur
2832   /// \param NumDependences  number of dependence variables
2833   /// \param DependenceAddress pointer to dependence variables
2834   /// \param HaveNowaitClause does nowait clause exist
2835   ///
2836   /// \returns CallInst to the __tgt_interop_use call
2837   LLVM_ABI CallInst *createOMPInteropUse(const LocationDescription &Loc,
2838                                          Value *InteropVar, Value *Device,
2839                                          Value *NumDependences,
2840                                          Value *DependenceAddress,
2841                                          bool HaveNowaitClause);
2842 
2843   /// The `omp target` interface
2844   ///
2845   /// For more information about the usage of this interface,
2846   /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2847   ///
2848   ///{
2849 
2850   /// Create a runtime call for kmpc_target_init
2851   ///
2852   /// \param Loc The insert and source location description.
2853   /// \param Attrs Structure containing the default attributes, including
2854   ///        numbers of threads and teams to launch the kernel with.
2855   LLVM_ABI InsertPointTy createTargetInit(
2856       const LocationDescription &Loc,
2857       const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs);
2858 
2859   /// Create a runtime call for kmpc_target_deinit
2860   ///
2861   /// \param Loc The insert and source location description.
2862   /// \param TeamsReductionDataSize The maximal size of all the reduction data
2863   ///        for teams reduction.
2864   /// \param TeamsReductionBufferLength The number of elements (each of up to
2865   ///        \p TeamsReductionDataSize size), in the teams reduction buffer.
2866   LLVM_ABI void createTargetDeinit(const LocationDescription &Loc,
2867                                    int32_t TeamsReductionDataSize = 0,
2868                                    int32_t TeamsReductionBufferLength = 1024);
2869 
2870   ///}
2871 
2872   /// Helpers to read/write kernel annotations from the IR.
2873   ///
2874   ///{
2875 
2876   /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none
2877   /// is set.
2878   LLVM_ABI static std::pair<int32_t, int32_t>
2879   readThreadBoundsForKernel(const Triple &T, Function &Kernel);
2880   LLVM_ABI static void writeThreadBoundsForKernel(const Triple &T,
2881                                                   Function &Kernel, int32_t LB,
2882                                                   int32_t UB);
2883 
2884   /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none
2885   /// is set.
2886   LLVM_ABI static std::pair<int32_t, int32_t>
2887   readTeamBoundsForKernel(const Triple &T, Function &Kernel);
2888   LLVM_ABI static void writeTeamsForKernel(const Triple &T, Function &Kernel,
2889                                            int32_t LB, int32_t UB);
2890   ///}
2891 
2892 private:
2893   // Sets the function attributes expected for the outlined function
2894   void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
2895 
2896   // Creates the function ID/Address for the given outlined function.
2897   // In the case of an embedded device function the address of the function is
2898   // used, in the case of a non-offload function a constant is created.
2899   Constant *createOutlinedFunctionID(Function *OutlinedFn,
2900                                      StringRef EntryFnIDName);
2901 
2902   // Creates the region entry address for the outlined function
2903   Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
2904                                         StringRef EntryFnName);
2905 
2906 public:
2907   /// Functions used to generate a function with the given name.
2908   using FunctionGenCallback =
2909       std::function<Expected<Function *>(StringRef FunctionName)>;
2910 
2911   /// Create a unique name for the entry function using the source location
2912   /// information of the current target region. The name will be something like:
2913   ///
2914   /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
2915   ///
2916   /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
2917   /// mangled name of the function that encloses the target region and BB is the
2918   /// line number of the target region. CC is a count added when more than one
2919   /// region is located at the same location.
2920   ///
2921   /// If this target outline function is not an offload entry, we don't need to
2922   /// register it. This may happen if it is guarded by an if clause that is
2923   /// false at compile time, or no target archs have been specified.
2924   ///
2925   /// The created target region ID is used by the runtime library to identify
2926   /// the current target region, so it only has to be unique and not
2927   /// necessarily point to anything. It could be the pointer to the outlined
2928   /// function that implements the target region, but we aren't using that so
2929   /// that the compiler doesn't need to keep that, and could therefore inline
2930   /// the host function if proven worthwhile during optimization. On the other
2931   /// hand, if emitting code for the device, the ID has to be the function
2932   /// address so that it can be retrieved from the offloading entry and launched
2933   /// by the runtime library. We also mark the outlined function to have
2934   /// external linkage in case we are emitting code for the device, because
2935   /// these functions will be entry points to the device.
2936   ///
2937   /// \param InfoManager The info manager keeping track of the offload entries
2938   /// \param EntryInfo The entry information about the function
2939   /// \param GenerateFunctionCallback The callback function to generate the code
2940   /// \param OutlinedFunction Pointer to the outlined function
2941   /// \param EntryFnIDName Name of the ID to be created
2942   LLVM_ABI Error emitTargetRegionFunction(
2943       TargetRegionEntryInfo &EntryInfo,
2944       FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
2945       Function *&OutlinedFn, Constant *&OutlinedFnID);
2946 
2947   /// Registers the given function and sets up the attributes of the function
2948   /// Returns the FunctionID.
2949   ///
2950   /// \param InfoManager The info manager keeping track of the offload entries
2951   /// \param EntryInfo The entry information about the function
2952   /// \param OutlinedFunction Pointer to the outlined function
2953   /// \param EntryFnName Name of the outlined function
2954   /// \param EntryFnIDName Name of the ID to be created
2955   LLVM_ABI Constant *
2956   registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
2957                                Function *OutlinedFunction,
2958                                StringRef EntryFnName, StringRef EntryFnIDName);
2959 
2960   /// Type of BodyGen to use for region codegen
2961   ///
2962   /// Priv: If device pointer privatization is required, emit the body of the
2963   /// region here. It will have to be duplicated: with and without
2964   /// privatization.
2965   /// DupNoPriv: If we need device pointer privatization, we need
2966   /// to emit the body of the region with no privatization in the 'else' branch
2967   /// of the conditional.
2968   /// NoPriv: If we don't require privatization of device
2969   /// pointers, we emit the body in between the runtime calls. This avoids
2970   /// duplicating the body code.
2971   enum BodyGenTy { Priv, DupNoPriv, NoPriv };
2972 
2973   /// Callback type for creating the map infos for the kernel parameters.
2974   /// \param CodeGenIP is the insertion point where code should be generated,
2975   ///        if any.
2976   using GenMapInfoCallbackTy =
2977       function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
2978 
2979 private:
2980   /// Emit the array initialization or deletion portion for user-defined mapper
2981   /// code generation. First, it evaluates whether an array section is mapped
2982   /// and whether the \a MapType instructs to delete this section. If \a IsInit
2983   /// is true, and \a MapType indicates to not delete this array, array
2984   /// initialization code is generated. If \a IsInit is false, and \a MapType
2985   /// indicates to delete this array, array deletion code is generated.
2986   void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
2987                                   llvm::Value *Base, llvm::Value *Begin,
2988                                   llvm::Value *Size, llvm::Value *MapType,
2989                                   llvm::Value *MapName, TypeSize ElementSize,
2990                                   llvm::BasicBlock *ExitBB, bool IsInit);
2991 
2992 public:
2993   /// Emit the user-defined mapper function. The code generation follows the
2994   /// pattern in the example below.
2995   /// \code
2996   /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
2997   ///                                           void *base, void *begin,
2998   ///                                           int64_t size, int64_t type,
2999   ///                                           void *name = nullptr) {
3000   ///   // Allocate space for an array section first or add a base/begin for
3001   ///   // pointer dereference.
3002   ///   if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
3003   ///       !maptype.IsDelete)
3004   ///     __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3005   ///                                 size*sizeof(Ty), clearToFromMember(type));
3006   ///   // Map members.
3007   ///   for (unsigned i = 0; i < size; i++) {
3008   ///     // For each component specified by this mapper:
3009   ///     for (auto c : begin[i]->all_components) {
3010   ///       if (c.hasMapper())
3011   ///         (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
3012   ///         c.arg_size,
3013   ///                       c.arg_type, c.arg_name);
3014   ///       else
3015   ///         __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
3016   ///                                     c.arg_begin, c.arg_size, c.arg_type,
3017   ///                                     c.arg_name);
3018   ///     }
3019   ///   }
3020   ///   // Delete the array section.
3021   ///   if (size > 1 && maptype.IsDelete)
3022   ///     __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3023   ///                                 size*sizeof(Ty), clearToFromMember(type));
3024   /// }
3025   /// \endcode
3026   ///
3027   /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
3028   /// MapInfos and returns.
3029   /// \param ElemTy DeclareMapper element type.
3030   /// \param FuncName Optional param to specify mapper function name.
3031   /// \param CustomMapperCB Optional callback to generate code related to
3032   /// custom mappers.
3033   LLVM_ABI Expected<Function *> emitUserDefinedMapper(
3034       function_ref<MapInfosOrErrorTy(
3035           InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)>
3036           PrivAndGenMapInfoCB,
3037       llvm::Type *ElemTy, StringRef FuncName,
3038       CustomMapperCallbackTy CustomMapperCB);
3039 
  /// Generator for '#omp target data'
  ///
  /// \param Loc The location where the target data construct was encountered.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param CodeGenIP The insertion point at which the target directive code
  /// should be placed.
  /// \param DeviceID Stores the DeviceID from the device clause.
  /// \param IfCond Value which corresponds to the if clause condition.
  /// \param Info Stores all information related to the Target Data directive.
  /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
  /// \param CustomMapperCB Callback to generate code related to
  /// custom mappers.
  /// \param MapperFunc Optional runtime function to be used for the mapper
  /// calls.
  /// \param BodyGenCB Optional Callback to generate the region code.
  /// \param DeviceAddrCB Optional callback to generate code related to
  /// use_device_ptr and use_device_addr.
  /// \param SrcLocInfo Optional source location information for the runtime.
  LLVM_ABI InsertPointOrErrorTy createTargetData(
      const LocationDescription &Loc, InsertPointTy AllocaIP,
      InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
      TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
      CustomMapperCallbackTy CustomMapperCB,
      omp::RuntimeFunction *MapperFunc = nullptr,
      function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
                                        BodyGenTy BodyGenType)>
          BodyGenCB = nullptr,
      function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
      Value *SrcLocInfo = nullptr);
3068 
  /// Callback type for generating the body of a target region.
  using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  /// Callback type for generating accessor instructions for a target kernel
  /// argument (see \c createTarget's ArgAccessorFuncCB parameter).
  using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
      Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
      InsertPointTy CodeGenIP)>;
3075 
  /// Generator for '#omp target'
  ///
  /// \param Loc where the target data construct was encountered.
  /// \param IsOffloadEntry whether it is an offload entry.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param CodeGenIP The insertion point where the call to the outlined
  ///        function should be emitted.
  /// \param Info Stores all information related to the Target directive.
  /// \param EntryInfo The entry information about the function.
  /// \param DefaultAttrs Structure containing the default attributes, including
  ///        numbers of threads and teams to launch the kernel with.
  /// \param RuntimeAttrs Structure containing the runtime numbers of threads
  ///        and teams to launch the kernel with.
  /// \param IfCond value of the `if` clause.
  /// \param Inputs The input values to the region that will be passed
  ///        as arguments to the outlined function.
  /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param ArgAccessorFuncCB Callback that will generate accessors
  ///        instructions for passed in target arguments where necessary
  /// \param CustomMapperCB Callback to generate code related to
  /// custom mappers.
  /// \param Dependencies A vector of DependData objects that carry
  ///        dependency information as passed in the depend clause
  /// \param HasNowait Whether the target construct has a `nowait` clause or
  ///        not.
  LLVM_ABI InsertPointOrErrorTy createTarget(
      const LocationDescription &Loc, bool IsOffloadEntry,
      OpenMPIRBuilder::InsertPointTy AllocaIP,
      OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info,
      TargetRegionEntryInfo &EntryInfo,
      const TargetKernelDefaultAttrs &DefaultAttrs,
      const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
      SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
      TargetBodyGenCallbackTy BodyGenCB,
      TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
      CustomMapperCallbackTy CustomMapperCB,
      const SmallVector<DependData> &Dependencies, bool HasNowait = false);
3112 
  /// Returns __kmpc_for_static_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned. Will create a distribute call
  /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
  /// \returns The matching runtime function callee.
  LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize,
                                                      bool IVSigned,
                                                      bool IsGPUDistribute);

  /// Returns __kmpc_dispatch_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize,
                                                     bool IVSigned);

  /// Returns __kmpc_dispatch_next_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize,
                                                     bool IVSigned);

  /// Returns __kmpc_dispatch_fini_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize,
                                                     bool IVSigned);

  /// Returns the __kmpc_dispatch_deinit runtime function.
  LLVM_ABI FunctionCallee createDispatchDeinitFunction();
3137 
  /// Declarations for LLVM-IR types (simple, array, function and structure) are
  /// generated below. Their names are defined and used in OpenMPKinds.def. Here
  /// we provide the declarations; the initializeTypes function will provide the
  /// values.
  ///
  ///{
#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  ArrayType *VarName##Ty = nullptr;                                            \
  PointerType *VarName##PtrTy = nullptr;
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  FunctionType *VarName = nullptr;                                             \
  PointerType *VarName##Ptr = nullptr;
#define OMP_STRUCT_TYPE(VarName, StrName, ...)                                 \
  StructType *VarName = nullptr;                                               \
  PointerType *VarName##Ptr = nullptr;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  ///}
3157 
private:
  /// Create all simple and struct types exposed by the runtime and remember
  /// the llvm::PointerTypes of them for easy access later.
  /// \param M Module the types are created for.
  void initializeTypes(Module &M);
3162 
  /// Common interface for generating entry calls for OMP Directives.
  /// If the directive has a region/body, it will set the insertion
  /// point to the body.
  ///
  /// \param OMPD Directive to generate entry blocks for
  /// \param EntryCall Call to the entry OMP Runtime Function
  /// \param ExitBB block where the region ends.
  /// \param Conditional indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  ///
  /// \return The insertion position in exit block
  InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
                                         BasicBlock *ExitBB,
                                         bool Conditional = false);
3178 
  /// Common interface to finalize the region
  ///
  /// \param OMPD Directive to generate exiting code for
  /// \param FinIP Insertion point for emitting Finalization code and exit call
  /// \param ExitCall Call to the ending OMP Runtime Function
  /// \param HasFinalize indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  ///
  /// \return The insertion position in exit block
  InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
                                               InsertPointTy FinIP,
                                               Instruction *ExitCall,
                                               bool HasFinalize = true);
3193 
  /// Common Interface to generate OMP inlined regions
  ///
  /// \param OMPD Directive to generate inlined region for
  /// \param EntryCall Call to the entry OMP Runtime Function
  /// \param ExitCall Call to the ending OMP Runtime Function
  /// \param BodyGenCB Body code generation callback.
  /// \param FiniCB Finalization Callback. Will be called when finalizing region
  /// \param Conditional indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  /// \param HasFinalize indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  /// \param IsCancellable if HasFinalize is set to true, indicate if the
  ///        directive should be cancellable.
  /// \return The insertion point after the region
  InsertPointOrErrorTy
  EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
                       Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
                       FinalizeCallbackTy FiniCB, bool Conditional = false,
                       bool HasFinalize = true, bool IsCancellable = false);
3215 
  /// Get the platform-specific name separator.
  /// \param Parts different parts of the final name that need separation
  /// \param FirstSeparator First separator used between the initial two
  ///        parts of the name.
  /// \param Separator separator used between all of the rest consecutive
  ///        parts of the name
  static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
                                           StringRef FirstSeparator,
                                           StringRef Separator);
3225 
  /// Returns corresponding lock object for the specified critical region
  /// name. If the lock object does not exist it is created, otherwise the
  /// reference to the existing copy is returned.
  /// \param CriticalName Name of the critical region.
  ///
  /// \return The lock object for the region.
  Value *getOMPCriticalRegionLock(StringRef CriticalName);
3232 
  /// Callback type for Atomic Expression update
  /// ex:
  /// \code{.cpp}
  /// unsigned x = 0;
  /// #pragma omp atomic update
  /// x = Expr(x_old);  // Expr() is any legal operation
  /// \endcode
  ///
  /// \param XOld the value of the atomic memory address to use for update
  /// \param IRB reference to the IRBuilder to use
  ///
  /// \returns Value to update X to.
  using AtomicUpdateCallbackTy =
      const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3247 
private:
  /// Kinds of OpenMP atomic operations handled by the builder.
  enum AtomicKind { Read, Write, Update, Capture, Compare };
3250 
  /// Determine whether to emit flush or not
  ///
  /// \param Loc    The insert and source location description.
  /// \param AO     The required atomic ordering
  /// \param AK     The OpenMP atomic operation kind used.
  ///
  /// \returns whether a flush was emitted or not
  bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
                                    AtomicOrdering AO, AtomicKind AK);
3260 
  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param AllocaIP     The insertion point to be used for alloca
  ///                     instructions.
  /// \param X            The target atomic pointer to be updated
  /// \param XElemTy      The element type of the atomic pointer.
  /// \param Expr         The value to update X with.
  /// \param AO           Atomic ordering of the generated atomic
  ///                     instructions.
  /// \param RMWOp        The binary operation used for update. If the
  ///                     operation is not supported by atomicRMW,
  ///                     or belongs to {FADD, FSUB, BAD_BINOP},
  ///                     then a `cmpExch` based atomic will be generated.
  /// \param UpdateOp     Code generator for complex expressions that cannot be
  ///                     expressed through atomicrmw instruction.
  /// \param VolatileX    true if \a X is volatile.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \returns A pair of the old value of X before the update, and the value
  ///          used for the update.
  Expected<std::pair<Value *, Value *>>
  emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
                   AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                   AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
                   bool IsXBinopExpr);
3290 
  /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2.
  ///
  /// \returns The resulting instruction
  Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                AtomicRMWInst::BinOp RMWOp);

  // NOTE(review): presumably records whether finalize() has already been run
  // on this builder -- confirm against the definition of finalize().
  bool IsFinalized;
3298 
public:
  /// A struct to pack relevant information while generating atomic Ops
  struct AtomicOpValue {
    Value *Var = nullptr;    // Pointer to the variable being accessed.
    Type *ElemTy = nullptr;  // Element type of the variable.
    bool IsSigned = false;   // Whether the value is treated as signed.
    bool IsVolatile = false; // Whether accesses must be volatile.
  };
3307 
  /// Emit atomic Read for : V = X --- Only Scalar data types.
  ///
  /// \param Loc      The insert and source location description.
  /// \param X        The target pointer to be atomically read
  /// \param V        Memory address where to store atomically read
  ///                 value
  /// \param AO       Atomic ordering of the generated atomic
  ///                 instructions.
  /// \param AllocaIP Insert point for allocas
  ///
  /// \return Insertion point after generated atomic read IR.
  LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc,
                                          AtomicOpValue &X, AtomicOpValue &V,
                                          AtomicOrdering AO,
                                          InsertPointTy AllocaIP);
3323 
  /// Emit atomic write for : X = Expr --- Only Scalar data types.
  ///
  /// \param Loc      The insert and source location description.
  /// \param X        The target pointer to be atomically written to
  /// \param Expr     The value to store.
  /// \param AO       Atomic ordering of the generated atomic
  ///                 instructions.
  /// \param AllocaIP Insert point for allocas
  ///
  /// \return Insertion point after generated atomic Write IR.
  LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc,
                                           AtomicOpValue &X, Value *Expr,
                                           AtomicOrdering AO,
                                           InsertPointTy AllocaIP);
3338 
  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param Loc      The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X        The target atomic pointer to be updated
  /// \param Expr     The value to update X with.
  /// \param AO       Atomic ordering of the generated atomic instructions.
  /// \param RMWOp    The binary operation used for update. If the operation
  ///                 is not supported by atomicRMW, or belongs to
  ///                 {FADD, FSUB, BAD_BINOP}, then a `cmpExch` based
  ///                 atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \return Insertion point after generated atomic update IR.
  LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(
      const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
      Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
      AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr);
3363 
  /// Emit atomic update for constructs: --- Only Scalar data types
  /// V = X; X = X BinOp Expr ,
  /// X = X BinOp Expr; V = X,
  /// V = X; X = Expr BinOp X,
  /// X = Expr BinOp X; V = X,
  /// V = X; X = UpdateOp(X),
  /// X = UpdateOp(X); V = X,
  ///
  /// \param Loc        The insert and source location description.
  /// \param AllocaIP   The insertion point to be used for alloca instructions.
  /// \param X          The target atomic pointer to be updated
  /// \param V          Memory address where to store captured value
  /// \param Expr       The value to update X with.
  /// \param AO         Atomic ordering of the generated atomic instructions
  /// \param RMWOp      The binary operation used for update. If the
  ///                   operation is not supported by atomicRMW, or belongs to
  ///                   {FADD, FSUB, BAD_BINOP}, then a cmpExch based
  ///                   atomic will be generated.
  /// \param UpdateOp   Code generator for complex expressions that cannot be
  ///                   expressed through atomicrmw instruction.
  /// \param UpdateExpr true if X is an in place update of the form
  ///                   X = X BinOp Expr or X = Expr BinOp X
  /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  /// \param IsPostfixUpdate true if original value of 'x' must be stored in
  ///                        'v', not an updated one.
  ///
  /// \return Insertion point after generated atomic capture IR.
  LLVM_ABI InsertPointOrErrorTy createAtomicCapture(
      const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
      AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
      AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
      bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr);
3398 
  /// Emit atomic compare for constructs: --- Only scalar data types
  /// cond-expr-stmt:
  /// x = x ordop expr ? expr : x;
  /// x = expr ordop x ? expr : x;
  /// x = x == e ? d : x;
  /// x = e == x ? d : x; (this one is not in the spec)
  /// cond-update-stmt:
  /// if (x ordop expr) { x = expr; }
  /// if (expr ordop x) { x = expr; }
  /// if (x == e) { x = d; }
  /// if (e == x) { x = d; } (this one is not in the spec)
  /// conditional-update-capture-atomic:
  /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
  /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
  /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  ///                                         IsFailOnly=true)
  /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
  /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  ///                                                IsFailOnly=true)
  ///
  /// \param Loc          The insert and source location description.
  /// \param X            The target atomic pointer to be updated.
  /// \param V            Memory address where to store captured value (for
  ///                     compare capture only).
  /// \param R            Memory address where to store comparison result
  ///                     (for compare capture with '==' only).
  /// \param E            The expected value ('e') for forms that use an
  ///                     equality comparison or an expression ('expr') for
  ///                     forms that use 'ordop' (logically an atomic maximum or
  ///                     minimum).
  /// \param D            The desired value for forms that use an equality
  ///                     comparison. For forms that use 'ordop', it should be
  ///                     \p nullptr.
  /// \param AO           Atomic ordering of the generated atomic instructions.
  /// \param Op           Atomic compare operation. It can only be ==, <, or >.
  /// \param IsXBinopExpr True if the conditional statement is in the form where
  ///                     x is on LHS. It only matters for < or >.
  /// \param IsPostfixUpdate  True if original value of 'x' must be stored in
  ///                         'v', not an updated one (for compare capture
  ///                         only).
  /// \param IsFailOnly   True if the original value of 'x' is stored to 'v'
  ///                     only when the comparison fails. This is only valid for
  ///                     the case the comparison is '=='.
  ///
  /// \return Insertion point after generated atomic capture IR.
  LLVM_ABI InsertPointTy
  createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
                      AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
                      AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
                      bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
  /// Overload that additionally takes the atomic ordering \p Failure,
  /// presumably used when the generated comparison fails -- confirm against
  /// the implementation.
  LLVM_ABI InsertPointTy createAtomicCompare(
      const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
      AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
      omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
      bool IsFailOnly, AtomicOrdering Failure);
3454 
  /// Create the control flow structure of a canonical OpenMP loop.
  ///
  /// The emitted loop will be disconnected, i.e. no edge to the loop's
  /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
  /// IRBuilder location is not preserved.
  ///
  /// \param DL        DebugLoc used for the instructions in the skeleton.
  /// \param TripCount Value to be used for the trip count.
  /// \param F         Function in which to insert the BasicBlocks.
  /// \param PreInsertBefore  Where to insert BBs that execute before the body,
  ///                         typically the body itself.
  /// \param PostInsertBefore Where to insert BBs that execute after the body.
  /// \param Name      Base name used to derive BB
  ///                  and instruction names.
  ///
  /// \returns The CanonicalLoopInfo that represents the emitted loop.
  LLVM_ABI CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
                                                 Function *F,
                                                 BasicBlock *PreInsertBefore,
                                                 BasicBlock *PostInsertBefore,
                                                 const Twine &Name = {});
  /// OMP Offload Info Metadata name string
  const std::string ompOffloadInfoName = "omp_offload.info";
3478 
  /// Loads all the offload entries information from the host IR
  /// metadata. This function is only meant to be used with device code
  /// generation.
  ///
  /// \param M         Module to load Metadata info from. The module passed may
  /// be loaded from a bitcode file, i.e. different from OpenMPIRBuilder::M
  /// module.
  LLVM_ABI void loadOffloadInfoMetadata(Module &M);
3486 
  /// Loads all the offload entries information from the host IR
  /// metadata read from the file passed in as the HostFilePath argument. This
  /// function is only meant to be used with device code generation.
  ///
  /// \param HostFilePath The path to the host IR file,
  /// used to load in offload metadata for the device, allowing host and device
  /// to maintain the same metadata mapping.
  LLVM_ABI void loadOffloadInfoMetadata(StringRef HostFilePath);
3495 
  /// Gets (if a variable with the given name already exists) or creates an
  /// internal global variable with the specified Name. The created variable has
  /// linkage CommonLinkage by default and is initialized by null value.
  /// \param Ty Type of the global variable. If the variable already exists,
  /// the type must be the same.
  /// \param Name Name of the variable.
  LLVM_ABI GlobalVariable *
  getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                              unsigned AddressSpace = 0);
3505 };
3506 
/// Class to represent the control flow structure of an OpenMP canonical loop.
3508 ///
3509 /// The control-flow structure is standardized for easy consumption by
3510 /// directives associated with loops. For instance, the worksharing-loop
3511 /// construct may change this control flow such that each loop iteration is
3512 /// executed on only one thread. The constraints of a canonical loop in brief
3513 /// are:
3514 ///
3515 ///  * The number of loop iterations must have been computed before entering the
3516 ///    loop.
3517 ///
3518 ///  * Has an (unsigned) logical induction variable that starts at zero and
3519 ///    increments by one.
3520 ///
3521 ///  * The loop's CFG itself has no side-effects. The OpenMP specification
3522 ///    itself allows side-effects, but the order in which they happen, including
3523 ///    how often or whether at all, is unspecified. We expect that the frontend
3524 ///    will emit those side-effect instructions somewhere (e.g. before the loop)
3525 ///    such that the CanonicalLoopInfo itself can be side-effect free.
3526 ///
3527 /// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
/// execution of a loop body that satisfies these constraints. It does NOT
3529 /// represent arbitrary SESE regions that happen to contain a loop. Do not use
3530 /// CanonicalLoopInfo for such purposes.
3531 ///
3532 /// The control flow can be described as follows:
3533 ///
3534 ///     Preheader
3535 ///        |
3536 ///  /-> Header
3537 ///  |     |
3538 ///  |    Cond---\
3539 ///  |     |     |
3540 ///  |    Body   |
3541 ///  |    | |    |
3542 ///  |   <...>   |
3543 ///  |    | |    |
3544 ///   \--Latch   |
3545 ///              |
3546 ///             Exit
3547 ///              |
3548 ///            After
3549 ///
3550 /// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3551 /// including) and end at AfterIP (at the After's first instruction, excluding).
3552 /// That is, instructions in the Preheader and After blocks (except the
3553 /// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3554 /// side-effects. Typically, the Preheader is used to compute the loop's trip
3555 /// count. The instructions from BodyIP (at the Body block's first instruction,
3556 /// excluding) until the Latch are also considered outside CanonicalLoopInfo's
3557 /// control and thus can have side-effects. The body block is the single entry
3558 /// point into the loop body, which may contain arbitrary control flow as long
3559 /// as all control paths eventually branch to the Latch block.
3560 ///
/// TODO: Consider adding another standardized BasicBlock between Body CFG and
/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier by not needing to consider multiple
/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
/// executes after each body iteration.
3567 ///
/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalent to saying that the Latch has no PHINode and the Header's only
/// PHINode is for the induction variable.
3571 ///
3572 /// All code in Header, Cond, Latch and Exit (plus the terminator of the
3573 /// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
3574 /// by assertOK(). They are expected to not be modified unless explicitly
/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
3576 /// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3577 /// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3578 /// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3579 /// anymore as its underlying control flow may not exist anymore.
3580 /// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3581 /// may also return a new CanonicalLoopInfo that can be passed to other
3582 /// loop-associated construct implementing methods. These loop-transforming
3583 /// methods may either create a new CanonicalLoopInfo usually using
3584 /// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
3585 /// modify one of the input CanonicalLoopInfo and return it as representing the
3586 /// modified loop. What is done is an implementation detail of
3587 /// transformation-implementing method and callers should always assume that the
3588 /// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
3589 /// Returned CanonicalLoopInfo have the same structure and guarantees as the one
3590 /// created by createCanonicalLoop, such that transforming methods do not have
3591 /// to special case where the CanonicalLoopInfo originated from.
3592 ///
3593 /// Generally, methods consuming CanonicalLoopInfo do not need an
3594 /// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3595 /// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3596 /// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3597 /// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3598 /// any InsertPoint in the Preheader, After or Block can still be used after
3599 /// calling such a method.
3600 ///
3601 /// TODO: Provide mechanisms for exception handling and cancellation points.
3602 ///
3603 /// Defined outside OpenMPIRBuilder because nested classes cannot be
3604 /// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
3605 class CanonicalLoopInfo {
3606   friend class OpenMPIRBuilder;
3607 
private:
  /// The loop's control blocks as laid out in the CFG diagram of the class
  /// comment (the Preheader, Body and After blocks are derived from these).
  BasicBlock *Header = nullptr;
  BasicBlock *Cond = nullptr;
  BasicBlock *Latch = nullptr;
  BasicBlock *Exit = nullptr;

  // Holds the value for the `lastiter` of the canonical loop.
  // NOTE(review): original comment called this "the MLIR value" -- presumably
  // it is populated by MLIR-based frontends via setLastIter(); confirm.
  Value *LastIter = nullptr;
3616 
3617   /// Add the control blocks of this loop to \p BBs.
3618   ///
3619   /// This does not include any block from the body, including the one returned
3620   /// by getBody().
3621   ///
3622   /// FIXME: This currently includes the Preheader and After blocks even though
3623   /// their content is (mostly) not under CanonicalLoopInfo's control.
3624   /// Re-evaluated whether this makes sense.
3625   void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3626 
3627   /// Sets the number of loop iterations to the given value. This value must be
3628   /// valid in the condition block (i.e., defined in the preheader) and is
3629   /// interpreted as an unsigned integer.
3630   void setTripCount(Value *TripCount);
3631 
3632   /// Replace all uses of the canonical induction variable in the loop body with
3633   /// a new one.
3634   ///
3635   /// The intended use case is to update the induction variable for an updated
3636   /// iteration space such that it can stay normalized in the 0...tripcount-1
3637   /// range.
3638   ///
3639   /// The \p Updater is called with the (presumable updated) current normalized
3640   /// induction variable and is expected to return the value that uses of the
3641   /// pre-updated induction values should use instead, typically dependent on
3642   /// the new induction variable. This is a lambda (instead of e.g. just passing
3643   /// the new value) to be able to distinguish the uses of the pre-updated
3644   /// induction variable and uses of the induction varible to compute the
3645   /// updated induction variable value.
3646   void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
3647 
3648 public:
3649   /// Sets the last iteration variable for this loop.
setLastIter(Value * IterVar)3650   void setLastIter(Value *IterVar) { LastIter = std::move(IterVar); }
3651 
3652   /// Returns the last iteration variable for this loop.
3653   /// Certain use-cases (like translation of linear clause) may access
3654   /// this variable even after a loop transformation. Hence, do not guard
3655   /// this getter function by `isValid`. It is the responsibility of the
3656   /// callee to ensure this functionality is not invoked by a non-outlined
3657   /// CanonicalLoopInfo object (in which case, `setLastIter` will never be
3658   /// invoked and `LastIter` will be by default `nullptr`).
getLastIter()3659   Value *getLastIter() { return LastIter; }
3660 
3661   /// Returns whether this object currently represents the IR of a loop. If
3662   /// returning false, it may have been consumed by a loop transformation or not
3663   /// been intialized. Do not use in this case;
isValid()3664   bool isValid() const { return Header; }
3665 
3666   /// The preheader ensures that there is only a single edge entering the loop.
3667   /// Code that must be execute before any loop iteration can be emitted here,
3668   /// such as computing the loop trip count and begin lifetime markers. Code in
3669   /// the preheader is not considered part of the canonical loop.
3670   LLVM_ABI BasicBlock *getPreheader() const;
3671 
3672   /// The header is the entry for each iteration. In the canonical control flow,
3673   /// it only contains the PHINode for the induction variable.
getHeader()3674   BasicBlock *getHeader() const {
3675     assert(isValid() && "Requires a valid canonical loop");
3676     return Header;
3677   }
3678 
3679   /// The condition block computes whether there is another loop iteration. If
3680   /// yes, branches to the body; otherwise to the exit block.
getCond()3681   BasicBlock *getCond() const {
3682     assert(isValid() && "Requires a valid canonical loop");
3683     return Cond;
3684   }
3685 
3686   /// The body block is the single entry for a loop iteration and not controlled
3687   /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
3688   /// eventually branch to the \p Latch block.
getBody()3689   BasicBlock *getBody() const {
3690     assert(isValid() && "Requires a valid canonical loop");
3691     return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
3692   }
3693 
3694   /// Reaching the latch indicates the end of the loop body code. In the
3695   /// canonical control flow, it only contains the increment of the induction
3696   /// variable.
getLatch()3697   BasicBlock *getLatch() const {
3698     assert(isValid() && "Requires a valid canonical loop");
3699     return Latch;
3700   }
3701 
3702   /// Reaching the exit indicates no more iterations are being executed.
getExit()3703   BasicBlock *getExit() const {
3704     assert(isValid() && "Requires a valid canonical loop");
3705     return Exit;
3706   }
3707 
3708   /// The after block is intended for clean-up code such as lifetime end
3709   /// markers. It is separate from the exit block to ensure, analogous to the
3710   /// preheader, it having just a single entry edge and being free from PHI
3711   /// nodes should there be multiple loop exits (such as from break
3712   /// statements/cancellations).
getAfter()3713   BasicBlock *getAfter() const {
3714     assert(isValid() && "Requires a valid canonical loop");
3715     return Exit->getSingleSuccessor();
3716   }
3717 
3718   /// Returns the llvm::Value containing the number of loop iterations. It must
3719   /// be valid in the preheader and always interpreted as an unsigned integer of
3720   /// any bit-width.
getTripCount()3721   Value *getTripCount() const {
3722     assert(isValid() && "Requires a valid canonical loop");
3723     Instruction *CmpI = &Cond->front();
3724     assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
3725     return CmpI->getOperand(1);
3726   }
3727 
3728   /// Returns the instruction representing the current logical induction
3729   /// variable. Always unsigned, always starting at 0 with an increment of one.
getIndVar()3730   Instruction *getIndVar() const {
3731     assert(isValid() && "Requires a valid canonical loop");
3732     Instruction *IndVarPHI = &Header->front();
3733     assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
3734     return IndVarPHI;
3735   }
3736 
3737   /// Return the type of the induction variable (and the trip count).
getIndVarType()3738   Type *getIndVarType() const {
3739     assert(isValid() && "Requires a valid canonical loop");
3740     return getIndVar()->getType();
3741   }
3742 
3743   /// Return the insertion point for user code before the loop.
getPreheaderIP()3744   OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
3745     assert(isValid() && "Requires a valid canonical loop");
3746     BasicBlock *Preheader = getPreheader();
3747     return {Preheader, std::prev(Preheader->end())};
3748   };
3749 
3750   /// Return the insertion point for user code in the body.
getBodyIP()3751   OpenMPIRBuilder::InsertPointTy getBodyIP() const {
3752     assert(isValid() && "Requires a valid canonical loop");
3753     BasicBlock *Body = getBody();
3754     return {Body, Body->begin()};
3755   };
3756 
3757   /// Return the insertion point for user code after the loop.
getAfterIP()3758   OpenMPIRBuilder::InsertPointTy getAfterIP() const {
3759     assert(isValid() && "Requires a valid canonical loop");
3760     BasicBlock *After = getAfter();
3761     return {After, After->begin()};
3762   };
3763 
getFunction()3764   Function *getFunction() const {
3765     assert(isValid() && "Requires a valid canonical loop");
3766     return Header->getParent();
3767   }
3768 
3769   /// Consistency self-check.
3770   LLVM_ABI void assertOK() const;
3771 
3772   /// Invalidate this loop. That is, the underlying IR does not fulfill the
3773   /// requirements of an OpenMP canonical loop anymore.
3774   LLVM_ABI void invalidate();
3775 };
3776 
3777 } // end namespace llvm
3778 
3779 #endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
3780