//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. whether inserting
/// instructions at position IP1 may change the meaning of IP2, or vice versa.
/// This is because an InsertPoint stores the instruction before which new
/// instructions are inserted. For instance, if both point to the same
/// instruction, two IRBuilders alternately creating instructions will cause
/// the instructions to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use based on the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return llvm::omp::OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to the schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering or monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Remap combinations that the runtime does not support to the closest
  // supported schedule type.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to the schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic modifiers contradict each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The OpenMP runtime library uses monotonic by default, so there is no
      // need to set the flag explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
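
// Worked example (added for exposition, not part of the lowering logic): a
// clause such as 'schedule(dynamic, 4)' without an ordered clause or
// monotonicity modifier composes as
//   BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic,
// while plain 'schedule(static)' keeps the monotonic default per the OpenMP
// 5.1 rule above and remains the unmodified UnorderedStatic encoding. The
// concrete bit values are defined in llvm/Frontend/OpenMP/OMPConstants.h.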

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///             the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
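
// Illustrative sketch (added for exposition; block names are made up):
// splitting at an insert point in the middle of %entry with
// CreateBranch=true turns
//   %entry:        inst1; inst2; instAfterIP; ...
// into
//   %entry:        inst1; inst2; br label %entry.split
//   %entry.split:  instAfterIP; ...
// where the new block inherits the old block's name (plus a suffix when
// splitBBWithSuffix is used) and PHI uses in the successors are rewired to
// the new block.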

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
Value *createFakeIntVal(IRBuilder<> &Builder,
                        OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                        std::stack<Instruction *> &ToBeDeleted,
                        OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                        const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push(UseFakeVal);
  return FakeVal;
}
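
// Usage sketch (added for exposition; the variable names are illustrative):
// a caller outlining a task-like region can reserve a slot for, e.g., the
// thread ID argument before the real value exists, and erase the scaffolding
// once outlining is done:
//   std::stack<Instruction *> ToBeDeleted;
//   Value *TID =
//       createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted, InnerAllocaIP,
//                        "global.tid", /*AsPtr=*/false);
//   // ... outline the region so the fake value becomes an argument ...
//   while (!ToBeDeleted.empty()) {
//     ToBeDeleted.top()->eraseFromParent();
//     ToBeDeleted.pop();
//   }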

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
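
// For illustration (added for exposition): the flags compose as a bitmask, so
// a translation unit containing
//   #pragma omp requires unified_shared_memory, dynamic_allocators
// yields RequiresFlags == OMP_REQ_UNIFIED_SHARED_MEMORY |
// OMP_REQ_DYNAMIC_ALLOCATORS (0x018), while getRequiresFlags() returns
// OMP_REQ_NONE (0x001) when no requires directive was seen at all.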

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
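
// Note (added for exposition): the order above mirrors the runtime's kernel
// argument struct: version, number of mapped items, the six mapping arrays,
// the trip count, flags, and 3-D grids for teams and threads. Only dimension
// 0 of each grid is populated here; the remaining two dimensions stay
// zero-initialized via ZeroArray.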

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
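
// For illustration (added for exposition): the callback annotation above
// shows up in the textual IR roughly as
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}
// telling interprocedural passes that the microtask passed as argument 2 is
// invoked with the variadic arguments forwarded to it.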

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device's default alloca address
    // space. The OpenMP runtime requires that the params of the extracted
    // functions are passed as zero address space pointers. This flag ensures
    // that CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined region
      // and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization\n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
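
// For illustration (added for exposition; the flag name below is just an
// example): createGlobalFlag(1, "__omp_rtl_debug_kind") would emit
//   @__omp_rtl_debug_kind = weak_odr hidden constant i32 1
// so that compile-time configuration can be read back from the module.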

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for an existing encoding of the location + flags; this is not
    // strictly needed, but it minimizes the difference to the existing
    // solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
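
// For illustration (added for exposition): the resulting global is an ident_t
// struct of the form
//   @0 = private unnamed_addr constant %struct.ident_t
//        { i32 0, i32 <flags>, i32 <reserved>, i32 <strlen>, ptr <srcloc> },
//        align 8
// matching the five IdentData fields initialized above.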

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for an existing encoding of the location; this is not strictly
    // needed, but it minimizes the difference to the existing solution while
    // we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
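
// For illustration (added for exposition; file and function names are made
// up): getOrCreateSrcLocStr("foo", "test.c", 12, 3, Size) builds the string
//   ";test.c;foo;12;3;;"
// which follows the same ";file;function;line;column;;" layout as the
// unknown-location string used below.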

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}
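
// For illustration (added for exposition): this emits
//   %omp_global_thread_num = call i32 @__kmpc_global_thread_num(ptr @0)
// where @0 is the ident_t global produced by getOrCreateIdent.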

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;
  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
                                 bool ForceSimpleCall, bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
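
// For illustration (added for exposition; SSA names are made up): in a
// cancellable parallel region this lowers to
//   %0 = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
// followed by a branch on '%0 != 0' into the cancellation cleanup path,
// whereas a plain barrier becomes
//   call void @__kmpc_barrier(ptr @loc, i32 %tid)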

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities expect blocks to have terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that, so the compiler
  // doesn't need to keep it around and can therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply calls the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we have moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

// Callback used to create OpenMP runtime calls that implement the
// 'omp parallel' construct for the device.
// We use this callback to replace the call to OutlinedFn in OuterFn with a
// call to the OpenMP DeviceRTL runtime function (__kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store the captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
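
// For illustration (added for exposition; SSA names are made up): on the
// device this replaces 'call void @outlined(ptr %tid, ptr %zero, ...)' with
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 1, i32 -1,
//                                 i32 -1, ptr @outlined, ptr null,
//                                 ptr %args, i64 %nargs)
// where %args is the stack array of captured pointers filled in above.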

// Callback used to create OpenMP runtime calls that implement the
// 'omp parallel' construct for the host.
// We use this callback to replace the call to OutlinedFn in OuterFn with a
// call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1286 
1287 IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
1288     const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1289     BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1290     FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1291     omp::ProcBindKind ProcBind, bool IsCancellable) {
1292   assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1293 
1294   if (!updateToLocation(Loc))
1295     return Loc.IP;
1296 
1297   uint32_t SrcLocStrSize;
1298   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1299   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1300   Value *ThreadID = getOrCreateThreadID(Ident);
1301   // If we generate code for the target device, we need to allocate
1302   // a struct for aggregate params in the device default alloca address space.
1303   // The OpenMP runtime requires that the params of the extracted functions
1304   // are passed as zero address space pointers. This flag ensures that the
1305   // extracted function arguments are declared in the zero address space.
1306   bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1307 
1308   // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1309   // only if we compile for host side.
1310   if (NumThreads && !Config.isTargetDevice()) {
1311     Value *Args[] = {
1312         Ident, ThreadID,
1313         Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1314     Builder.CreateCall(
1315         getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1316   }
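  // E.g., for `num_threads(4)` the block above emits roughly:
  //   call void @__kmpc_push_num_threads(ptr @ident, i32 %gtid, i32 4)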
1317 
1318   if (ProcBind != OMP_PROC_BIND_default) {
1319     // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1320     Value *Args[] = {
1321         Ident, ThreadID,
1322         ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1323     Builder.CreateCall(
1324         getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1325   }
1326 
1327   BasicBlock *InsertBB = Builder.GetInsertBlock();
1328   Function *OuterFn = InsertBB->getParent();
1329 
1330   // Save the outer alloca block because the insertion iterator may get
1331   // invalidated and we still need this later.
1332   BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1333 
1334   // Vector to remember instructions we used only during the modeling but which
1335   // we want to delete at the end.
1336   SmallVector<Instruction *, 4> ToBeDeleted;
1337 
1338   // Change the location to the outer alloca insertion point to create and
1339   // initialize the allocas we pass into the parallel region.
1340   Builder.restoreIP(OuterAllocaIP);
1341   AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1342   AllocaInst *ZeroAddrAlloca =
1343       Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1344   Instruction *TIDAddr = TIDAddrAlloca;
1345   Instruction *ZeroAddr = ZeroAddrAlloca;
1346   if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1347     // Add additional casts to enforce pointers in the zero address space.
1348     TIDAddr = new AddrSpaceCastInst(
1349         TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1350     TIDAddr->insertAfter(TIDAddrAlloca);
1351     ToBeDeleted.push_back(TIDAddr);
1352     ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1353                                      PointerType::get(M.getContext(), 0),
1354                                      "zero.addr.ascast");
1355     ZeroAddr->insertAfter(ZeroAddrAlloca);
1356     ToBeDeleted.push_back(ZeroAddr);
1357   }
1358 
1359   // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1360   // associated arguments in the outlined function, so we delete them later.
1361   ToBeDeleted.push_back(TIDAddrAlloca);
1362   ToBeDeleted.push_back(ZeroAddrAlloca);
1363 
1364   // Create an artificial insertion point that will also ensure the blocks we
1365   // are about to split do not become degenerate.
1366   auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1367 
1368   BasicBlock *EntryBB = UI->getParent();
1369   BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1370   BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1371   BasicBlock *PRegPreFiniBB =
1372       PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1373   BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1374 
1375   auto FiniCBWrapper = [&](InsertPointTy IP) {
1376     // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1377     // target to the region exit block.
1378     if (IP.getBlock()->end() == IP.getPoint()) {
1379       IRBuilder<>::InsertPointGuard IPG(Builder);
1380       Builder.restoreIP(IP);
1381       Instruction *I = Builder.CreateBr(PRegExitBB);
1382       IP = InsertPointTy(I->getParent(), I->getIterator());
1383     }
1384     assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1385            IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1386            "Unexpected insertion point for finalization call!");
1387     return FiniCB(IP);
1388   };
1389 
1390   FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1391 
1392   // Generate the privatization allocas in the block that will become the entry
1393   // of the outlined function.
1394   Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1395   InsertPointTy InnerAllocaIP = Builder.saveIP();
1396 
1397   AllocaInst *PrivTIDAddr =
1398       Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1399   Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1400 
1401   // Add some fake uses for OpenMP-provided arguments.
1402   ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1403   Instruction *ZeroAddrUse =
1404       Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1405   ToBeDeleted.push_back(ZeroAddrUse);
1406 
1407   // EntryBB
1408   //   |
1409   //   V
1410   // PRegionEntryBB         <- Privatization allocas are placed here.
1411   //   |
1412   //   V
1413   // PRegionBodyBB          <- BodyGen is invoked here.
1414   //   |
1415   //   V
1416   // PRegPreFiniBB          <- The block we will start finalization from.
1417   //   |
1418   //   V
1419   // PRegionExitBB          <- A common exit to simplify block collection.
1420   //
1421 
1422   LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1423 
1424   // Let the caller create the body.
1425   assert(BodyGenCB && "Expected body generation callback!");
1426   InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1427   BodyGenCB(InnerAllocaIP, CodeGenIP);
1428 
1429   LLVM_DEBUG(dbgs() << "After  body codegen: " << *OuterFn << "\n");
1430 
1431   OutlineInfo OI;
1432   if (Config.isTargetDevice()) {
1433     // Generate OpenMP target specific runtime call
1434     OI.PostOutlineCB = [=, ToBeDeletedVec =
1435                                std::move(ToBeDeleted)](Function &OutlinedFn) {
1436       targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1437                              IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1438                              ThreadID, ToBeDeletedVec);
1439     };
1440   } else {
1441     // Generate OpenMP host runtime call
1442     OI.PostOutlineCB = [=, ToBeDeletedVec =
1443                                std::move(ToBeDeleted)](Function &OutlinedFn) {
1444       hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1445                            PrivTID, PrivTIDAddr, ToBeDeletedVec);
1446     };
1447   }
1448 
1449   // Adjust the finalization stack, verify the adjustment, and call the
1450   // finalize function one last time to finalize values between the pre-fini
1451   // block and the exit block if we left the parallel region "the normal way".
1452   auto FiniInfo = FinalizationStack.pop_back_val();
1453   (void)FiniInfo;
1454   assert(FiniInfo.DK == OMPD_parallel &&
1455          "Unexpected finalization stack state!");
1456 
1457   Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1458 
1459   InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1460   FiniCB(PreFiniIP);
1461 
1462   OI.OuterAllocaBB = OuterAllocaBlock;
1463   OI.EntryBB = PRegEntryBB;
1464   OI.ExitBB = PRegExitBB;
1465 
1466   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1467   SmallVector<BasicBlock *, 32> Blocks;
1468   OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1469 
1470   // Ensure a single exit node for the outlined region by creating one.
1471   // We might have multiple incoming edges to the exit now due to finalizations,
1472   // e.g., cancel calls that cause the control flow to leave the region.
1473   BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1474   PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1475   PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1476   Blocks.push_back(PRegOutlinedExitBB);
1477 
1478   CodeExtractorAnalysisCache CEAC(*OuterFn);
1479   CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1480                           /* AggregateArgs */ false,
1481                           /* BlockFrequencyInfo */ nullptr,
1482                           /* BranchProbabilityInfo */ nullptr,
1483                           /* AssumptionCache */ nullptr,
1484                           /* AllowVarArgs */ true,
1485                           /* AllowAlloca */ true,
1486                           /* AllocationBlock */ OuterAllocaBlock,
1487                           /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1488 
1489   // Find the inputs to and outputs from the code region.
1490   BasicBlock *CommonExit = nullptr;
1491   SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1492   Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1493   Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
1494 
1495   LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1496 
1497   FunctionCallee TIDRTLFn =
1498       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1499 
1500   auto PrivHelper = [&](Value &V) {
1501     if (&V == TIDAddr || &V == ZeroAddr) {
1502       OI.ExcludeArgsFromAggregate.push_back(&V);
1503       return;
1504     }
1505 
1506     SetVector<Use *> Uses;
1507     for (Use &U : V.uses())
1508       if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1509         if (ParallelRegionBlockSet.count(UserI->getParent()))
1510           Uses.insert(&U);
1511 
1512     // __kmpc_fork_call expects extra arguments as pointers. If the input
1513     // already has a pointer type, everything is fine. Otherwise, store the
1514     // value onto the stack and load it back inside the to-be-outlined region.
1515     // This ensures that only the pointer is passed to the function.
1516     // FIXME: if there are more than 15 trailing arguments, they must be
1517     // additionally packed in a struct.
1518     Value *Inner = &V;
1519     if (!V.getType()->isPointerTy()) {
1520       IRBuilder<>::InsertPointGuard Guard(Builder);
1521       LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1522 
1523       Builder.restoreIP(OuterAllocaIP);
1524       Value *Ptr =
1525           Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1526 
1527       // Store to the stack at the end of the block that currently branches
1528       // to the entry block of the to-be-outlined region.
1529       Builder.SetInsertPoint(InsertBB,
1530                              InsertBB->getTerminator()->getIterator());
1531       Builder.CreateStore(&V, Ptr);
1532 
1533       // Load back next to allocations in the to-be-outlined region.
1534       Builder.restoreIP(InnerAllocaIP);
1535       Inner = Builder.CreateLoad(V.getType(), Ptr);
1536     }
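    // Sketch of the wrapping performed above for a captured i32 %x
    // (names are illustrative):
    //   %x.reloaded = alloca i32        ; at OuterAllocaIP
    //   store i32 %x, ptr %x.reloaded   ; before branching into the region
    //   %0 = load i32, ptr %x.reloaded  ; at InnerAllocaIP, becomes Inner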
1537 
1538     Value *ReplacementValue = nullptr;
1539     CallInst *CI = dyn_cast<CallInst>(&V);
1540     if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1541       ReplacementValue = PrivTID;
1542     } else {
1543       Builder.restoreIP(
1544           PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1545       assert(ReplacementValue &&
1546              "Expected copy/create callback to set replacement value!");
1547       if (ReplacementValue == &V)
1548         return;
1549     }
1550 
1551     for (Use *UPtr : Uses)
1552       UPtr->set(ReplacementValue);
1553   };
1554 
1555   // Reset the inner alloca insertion point as it will be used for loading the
1556   // values wrapped into pointers before passing them into the to-be-outlined
1557   // region. Configure it to insert immediately after the fake use of the zero
1558   // address so that the loaded values are available in the generated body and
1559   // so that the OpenMP-related values (thread ID and zero address pointers)
1560   // remain leading in the argument list.
1561   InnerAllocaIP = IRBuilder<>::InsertPoint(
1562       ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1563 
1564   // Reset the outer alloca insertion point to the entry of the relevant block
1565   // in case it was invalidated.
1566   OuterAllocaIP = IRBuilder<>::InsertPoint(
1567       OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1568 
1569   for (Value *Input : Inputs) {
1570     LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1571     PrivHelper(*Input);
1572   }
1573   LLVM_DEBUG({
1574     for (Value *Output : Outputs)
1575       dbgs() << "Captured output: " << *Output << "\n";
1576   });
1577   assert(Outputs.empty() &&
1578          "OpenMP outlining should not produce live-out values!");
1579 
1580   LLVM_DEBUG(dbgs() << "After  privatization: " << *OuterFn << "\n");
1581   LLVM_DEBUG({
1582     for (auto *BB : Blocks)
1583       dbgs() << " PBR: " << BB->getName() << "\n";
1584   });
1585 
1586   // Register the outlined info.
1587   addOutlineInfo(std::move(OI));
1588 
1589   InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1590   UI->eraseFromParent();
1591 
1592   return AfterIP;
1593 }
1594 
1595 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1596   // Build call void __kmpc_flush(ident_t *loc)
1597   uint32_t SrcLocStrSize;
1598   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1599   Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1600 
1601   Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1602 }
1603 
1604 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1605   if (!updateToLocation(Loc))
1606     return;
1607   emitFlush(Loc);
1608 }
1609 
1610 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1611   // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1612   // global_tid);
1613   uint32_t SrcLocStrSize;
1614   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1615   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1616   Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1617 
1618   // Ignore return result until untied tasks are supported.
1619   Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1620                      Args);
1621 }
1622 
1623 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1624   if (!updateToLocation(Loc))
1625     return;
1626   emitTaskwaitImpl(Loc);
1627 }
1628 
1629 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1630   // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1631   uint32_t SrcLocStrSize;
1632   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1633   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1634   Constant *I32Null = ConstantInt::getNullValue(Int32);
1635   Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1636 
1637   Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1638                      Args);
1639 }
1640 
1641 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1642   if (!updateToLocation(Loc))
1643     return;
1644   emitTaskyieldImpl(Loc);
1645 }
1646 
1647 OpenMPIRBuilder::InsertPointTy
1648 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1649                             InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1650                             bool Tied, Value *Final, Value *IfCondition,
1651                             SmallVector<DependData> Dependencies) {
1652 
1653   if (!updateToLocation(Loc))
1654     return InsertPointTy();
1655 
1656   uint32_t SrcLocStrSize;
1657   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1658   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1659   // The current basic block is split into four basic blocks. After outlining,
1660   // they will be mapped as follows:
1661   // ```
1662   // def current_fn() {
1663   //   current_basic_block:
1664   //     br label %task.exit
1665   //   task.exit:
1666   //     ; instructions after task
1667   // }
1668   // def outlined_fn() {
1669   //   task.alloca:
1670   //     br label %task.body
1671   //   task.body:
1672   //     ret void
1673   // }
1674   // ```
1675   BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1676   BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1677   BasicBlock *TaskAllocaBB =
1678       splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1679 
1680   InsertPointTy TaskAllocaIP =
1681       InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1682   InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1683   BodyGenCB(TaskAllocaIP, TaskBodyIP);
1684 
1685   OutlineInfo OI;
1686   OI.EntryBB = TaskAllocaBB;
1687   OI.OuterAllocaBB = AllocaIP.getBlock();
1688   OI.ExitBB = TaskExitBB;
1689 
1690   // Add the thread ID argument.
1691   std::stack<Instruction *> ToBeDeleted;
1692   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1693       Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1694 
1695   OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1696                       TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1697     // Replace the stale CI with an appropriate RTL function call.
1698     assert(OutlinedFn.getNumUses() == 1 &&
1699            "there must be a single user for the outlined function");
1700     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1701 
1702     // HasShareds is true if any variables are captured in the outlined region,
1703     // false otherwise.
1704     bool HasShareds = StaleCI->arg_size() > 1;
1705     Builder.SetInsertPoint(StaleCI);
1706 
1707     // Gather the arguments for emitting the runtime call for
1708     // @__kmpc_omp_task_alloc
1709     Function *TaskAllocFn =
1710         getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1711 
1712     // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1713     // call.
1714     Value *ThreadID = getOrCreateThreadID(Ident);
1715 
1716     // Argument - `flags`
1717     // Task is tied iff (Flags & 1) == 1.
1718     // Task is untied iff (Flags & 1) == 0.
1719     // Task is final iff (Flags & 2) == 2.
1720     // Task is not final iff (Flags & 2) == 0.
1721     // TODO: Handle the other flags.
1722     Value *Flags = Builder.getInt32(Tied);
1723     if (Final) {
1724       Value *FinalFlag =
1725           Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1726       Flags = Builder.CreateOr(FinalFlag, Flags);
1727     }
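    // E.g., a tied task with a true `final` clause ends up with Flags == 3
    // (tied bit 0x1 | final bit 0x2).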
1728 
1729     // Argument - `sizeof_kmp_task_t` (TaskSize)
1730     // TaskSize refers to the size in bytes of the kmp_task_t data structure
1731     // including private vars accessed in the task.
1732     // TODO: add kmp_task_t_with_privates (privates)
1733     Value *TaskSize = Builder.getInt64(
1734         divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1735 
1736     // Argument - `sizeof_shareds` (SharedsSize)
1737     // SharedsSize refers to the shareds array size in the kmp_task_t data
1738     // structure.
1739     Value *SharedsSize = Builder.getInt64(0);
1740     if (HasShareds) {
1741       AllocaInst *ArgStructAlloca =
1742           dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1743       assert(ArgStructAlloca &&
1744              "Unable to find the alloca instruction corresponding to arguments "
1745              "for extracted function");
1746       StructType *ArgStructType =
1747           dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1748       assert(ArgStructType && "Unable to find struct type corresponding to "
1749                               "arguments for extracted function");
1750       SharedsSize =
1751           Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1752     }
1753     // Emit the @__kmpc_omp_task_alloc runtime call.
1754     // The runtime call returns a pointer to an area where the task-captured
1755     // variables must be copied before the task is run (TaskData).
1756     CallInst *TaskData = Builder.CreateCall(
1757         TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1758                       /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1759                       /*task_func=*/&OutlinedFn});
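    // Sketch of the emitted allocation call (types abbreviated):
    //   %task = call ptr @__kmpc_omp_task_alloc(ptr @ident, i32 %gtid,
    //               i32 %flags, i64 %sizeof_task, i64 %sizeof_shareds,
    //               ptr @outlined_fn)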
1760 
1761     // Copy the arguments for the outlined function.
1762     if (HasShareds) {
1763       Value *Shareds = StaleCI->getArgOperand(1);
1764       Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1765       Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1766       Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1767                            SharedsSize);
1768     }
1769 
1770     Value *DepArray = nullptr;
1771     if (Dependencies.size()) {
1772       InsertPointTy OldIP = Builder.saveIP();
1773       Builder.SetInsertPoint(
1774           &OldIP.getBlock()->getParent()->getEntryBlock().back());
1775 
1776       Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1777       DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
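      // Each array element is a dependency descriptor; with the
      // RTLDependInfoFields layout used below it is filled roughly as:
      //   { i64 ptrtoint(%var), i64 sizeof(%var), i8 dep_kind }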
1778 
1779       unsigned P = 0;
1780       for (const DependData &Dep : Dependencies) {
1781         Value *Base =
1782             Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1783         // Store the pointer to the variable
1784         Value *Addr = Builder.CreateStructGEP(
1785             DependInfo, Base,
1786             static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1787         Value *DepValPtr =
1788             Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1789         Builder.CreateStore(DepValPtr, Addr);
1790         // Store the size of the variable
1791         Value *Size = Builder.CreateStructGEP(
1792             DependInfo, Base,
1793             static_cast<unsigned int>(RTLDependInfoFields::Len));
1794         Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1795                                 Dep.DepValueType)),
1796                             Size);
1797         // Store the dependency kind
1798         Value *Flags = Builder.CreateStructGEP(
1799             DependInfo, Base,
1800             static_cast<unsigned int>(RTLDependInfoFields::Flags));
1801         Builder.CreateStore(
1802             ConstantInt::get(Builder.getInt8Ty(),
1803                              static_cast<unsigned int>(Dep.DepKind)),
1804             Flags);
1805         ++P;
1806       }
1807 
1808       Builder.restoreIP(OldIP);
1809     }
1810 
1811     // In the presence of the `if` clause, the following IR is generated:
1812     //    ...
1813     //    %data = call @__kmpc_omp_task_alloc(...)
1814     //    br i1 %if_condition, label %then, label %else
1815     //  then:
1816     //    call @__kmpc_omp_task(...)
1817     //    br label %exit
1818     //  else:
1819     //    call @__kmpc_omp_task_begin_if0(...)
1820     //    call @outlined_fn(...)
1821     //    call @__kmpc_omp_task_complete_if0(...)
1822     //    br label %exit
1823     //  exit:
1824     //    ...
1825     if (IfCondition) {
1826       // `SplitBlockAndInsertIfThenElse` requires the block to have a
1827       // terminator.
1828       splitBB(Builder, /*CreateBranch=*/true, "if.end");
1829       Instruction *IfTerminator =
1830           Builder.GetInsertPoint()->getParent()->getTerminator();
1831       Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1832       Builder.SetInsertPoint(IfTerminator);
1833       SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1834                                     &ElseTI);
1835       Builder.SetInsertPoint(ElseTI);
1836       Function *TaskBeginFn =
1837           getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1838       Function *TaskCompleteFn =
1839           getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1840       Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1841       CallInst *CI = nullptr;
1842       if (HasShareds)
1843         CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1844       else
1845         CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1846       CI->setDebugLoc(StaleCI->getDebugLoc());
1847       Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1848       Builder.SetInsertPoint(ThenTI);
1849     }
1850 
1851     if (Dependencies.size()) {
1852       Function *TaskFn =
1853           getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
1854       Builder.CreateCall(
1855           TaskFn,
1856           {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
1857            DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
1858            ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
1859 
1860     } else {
1861       // Emit the @__kmpc_omp_task runtime call to spawn the task
1862       Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
1863       Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
1864     }
1865 
1866     StaleCI->eraseFromParent();
1867 
1868     Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
1869     if (HasShareds) {
1870       LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
1871       OutlinedFn.getArg(1)->replaceUsesWithIf(
1872           Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
1873     }
1874 
1875     while (!ToBeDeleted.empty()) {
1876       ToBeDeleted.top()->eraseFromParent();
1877       ToBeDeleted.pop();
1878     }
1879   };
1880 
1881   addOutlineInfo(std::move(OI));
1882   Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
1883 
1884   return Builder.saveIP();
1885 }
1886 
1887 OpenMPIRBuilder::InsertPointTy
1888 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
1889                                  InsertPointTy AllocaIP,
1890                                  BodyGenCallbackTy BodyGenCB) {
1891   if (!updateToLocation(Loc))
1892     return InsertPointTy();
1893 
1894   uint32_t SrcLocStrSize;
1895   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1896   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1897   Value *ThreadID = getOrCreateThreadID(Ident);
1898 
1899   // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
1900   Function *TaskgroupFn =
1901       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
1902   Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
1903 
1904   BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
1905   BodyGenCB(AllocaIP, Builder.saveIP());
1906 
1907   Builder.SetInsertPoint(TaskgroupExitBB);
1908   // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
1909   Function *EndTaskgroupFn =
1910       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
1911   Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
1912 
1913   return Builder.saveIP();
1914 }
1915 
1916 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
1917     const LocationDescription &Loc, InsertPointTy AllocaIP,
1918     ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
1919     FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1920   assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1921 
1922   if (!updateToLocation(Loc))
1923     return Loc.IP;
1924 
1925   auto FiniCBWrapper = [&](InsertPointTy IP) {
1926     if (IP.getBlock()->end() != IP.getPoint())
1927       return FiniCB(IP);
1928     // This must be done, otherwise any nested constructs using
1929     // FinalizeOMPRegion will fail because that function requires the
1930     // finalization basic block to have a terminator, already removed by
1931     // EmitOMPRegionBody. IP is currently at the cancellation block.
1932     // We need to backtrack to the condition block to fetch
1933     // the exit block and create a branch from the cancellation
1934     // block to the exit block.
1935     IRBuilder<>::InsertPointGuard IPG(Builder);
1936     Builder.restoreIP(IP);
1937     auto *CaseBB = IP.getBlock()->getSinglePredecessor();
1938     auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
1939     auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
1940     Instruction *I = Builder.CreateBr(ExitBB);
1941     IP = InsertPointTy(I->getParent(), I->getIterator());
1942     return FiniCB(IP);
1943   };
1944 
1945   FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
1946 
1947   // Each section is emitted as a switch case.
1948   // Each finalization callback is handled from clang.EmitOMPSectionDirective()
1949   // -> OMP.createSection(), which generates the IR for each section.
1950   // Iterate through all sections and emit a switch construct:
1951   // switch (IV) {
1952   //   case 0:
1953   //     <SectionStmt[0]>;
1954   //     break;
1955   // ...
1956   //   case <NumSection> - 1:
1957   //     <SectionStmt[<NumSection> - 1]>;
1958   //     break;
1959   // }
1960   // ...
1961   // section_loop.after:
1962   // <FiniCB>;
1963   auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
1964     Builder.restoreIP(CodeGenIP);
1965     BasicBlock *Continue =
1966         splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
1967     Function *CurFn = Continue->getParent();
1968     SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
1969 
1970     unsigned CaseNumber = 0;
1971     for (auto SectionCB : SectionCBs) {
1972       BasicBlock *CaseBB = BasicBlock::Create(
1973           M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
1974       SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
1975       Builder.SetInsertPoint(CaseBB);
1976       BranchInst *CaseEndBr = Builder.CreateBr(Continue);
1977       SectionCB(InsertPointTy(),
1978                 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
1979       CaseNumber++;
1980     }
1981     // Remove the existing terminator from the body BB since there can be no
1982     // terminators after a switch/case.
1983   };
1984   // Loop body ends here
1985   // LowerBound, UpperBound, and STride for createCanonicalLoop
1986   Type *I32Ty = Type::getInt32Ty(M.getContext());
1987   Value *LB = ConstantInt::get(I32Ty, 0);
1988   Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
1989   Value *ST = ConstantInt::get(I32Ty, 1);
1990   llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
1991       Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
1992   InsertPointTy AfterIP =
1993       applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
1994 
1995   // Apply the finalization callback in LoopAfterBB
1996   auto FiniInfo = FinalizationStack.pop_back_val();
1997   assert(FiniInfo.DK == OMPD_sections &&
1998          "Unexpected finalization stack state!");
1999   if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2000     Builder.restoreIP(AfterIP);
2001     BasicBlock *FiniBB =
2002         splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2003     CB(Builder.saveIP());
2004     AfterIP = {FiniBB, FiniBB->begin()};
2005   }
2006 
2007   return AfterIP;
2008 }
2009 
2010 OpenMPIRBuilder::InsertPointTy
2011 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2012                                BodyGenCallbackTy BodyGenCB,
2013                                FinalizeCallbackTy FiniCB) {
2014   if (!updateToLocation(Loc))
2015     return Loc.IP;
2016 
2017   auto FiniCBWrapper = [&](InsertPointTy IP) {
2018     if (IP.getBlock()->end() != IP.getPoint())
2019       return FiniCB(IP);
2020     // This must be done, otherwise any nested constructs using
2021     // FinalizeOMPRegion will fail because that function requires the
2022     // finalization basic block to have a terminator, already removed by
2023     // EmitOMPRegionBody. IP is currently at the cancellation block.
2024     // We need to backtrack to the condition block to fetch
2025     // the exit block and create a branch from the cancellation
2026     // block to the exit block.
2027     IRBuilder<>::InsertPointGuard IPG(Builder);
2028     Builder.restoreIP(IP);
2029     auto *CaseBB = Loc.IP.getBlock();
2030     auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2031     auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2032     Instruction *I = Builder.CreateBr(ExitBB);
2033     IP = InsertPointTy(I->getParent(), I->getIterator());
2034     return FiniCB(IP);
2035   };
2036 
2037   Directive OMPD = Directive::OMPD_sections;
2038   // Since we are using the Finalization Callback here, HasFinalize
2039   // and IsCancellable have to be true.
2040   return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2041                               /*Conditional*/ false, /*hasFinalize*/ true,
2042                               /*IsCancellable*/ true);
2043 }
2044 
2045 /// Create a function with a unique name and a "void (i8*, i8*)" signature in
2046 /// the given module and return it.
2047 Function *getFreshReductionFunc(Module &M) {
2048   Type *VoidTy = Type::getVoidTy(M.getContext());
2049   Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
2050   auto *FuncTy =
2051       FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
2052   return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2053                           M.getDataLayout().getDefaultGlobalsAddressSpace(),
2054                           ".omp.reduction.func", &M);
2055 }
2056 
2057 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
2058     const LocationDescription &Loc, InsertPointTy AllocaIP,
2059     ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
2060   for (const ReductionInfo &RI : ReductionInfos) {
2061     (void)RI;
2062     assert(RI.Variable && "expected non-null variable");
2063     assert(RI.PrivateVariable && "expected non-null private variable");
2064     assert(RI.ReductionGen && "expected non-null reduction generator callback");
2065     assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
2066            "expected variables and their private equivalents to have the same "
2067            "type");
2068     assert(RI.Variable->getType()->isPointerTy() &&
2069            "expected variables to be pointers");
2070   }
2071 
2072   if (!updateToLocation(Loc))
2073     return InsertPointTy();
2074 
2075   BasicBlock *InsertBlock = Loc.IP.getBlock();
2076   BasicBlock *ContinuationBlock =
2077       InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
2078   InsertBlock->getTerminator()->eraseFromParent();
2079 
2080   // Create and populate array of type-erased pointers to private reduction
2081   // values.
2082   unsigned NumReductions = ReductionInfos.size();
2083   Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
2084   Builder.restoreIP(AllocaIP);
2085   Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
2086 
2087   Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
2088 
2089   for (auto En : enumerate(ReductionInfos)) {
2090     unsigned Index = En.index();
2091     const ReductionInfo &RI = En.value();
2092     Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
2093         RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
2094     Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
2095   }
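  // At this point the array holds one type-erased pointer per reduction,
  // e.g. for two reductions (sketch, names illustrative):
  //   %red.array = alloca [2 x ptr]
  //   store ptr %priv.a, ptr %red.array.elem.0
  //   store ptr %priv.b, ptr %red.array.elem.1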
2096 
2097   // Emit a call to the runtime function that orchestrates the reduction.
2098   // Declare the reduction function in the process.
2099   Function *Func = Builder.GetInsertBlock()->getParent();
2100   Module *Module = Func->getParent();
2101   uint32_t SrcLocStrSize;
2102   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2103   bool CanGenerateAtomic =
2104       llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
2105         return RI.AtomicReductionGen;
2106       });
2107   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
2108                                   CanGenerateAtomic
2109                                       ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
2110                                       : IdentFlag(0));
2111   Value *ThreadId = getOrCreateThreadID(Ident);
2112   Constant *NumVariables = Builder.getInt32(NumReductions);
2113   const DataLayout &DL = Module->getDataLayout();
2114   unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
2115   Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
2116   Function *ReductionFunc = getFreshReductionFunc(*Module);
2117   Value *Lock = getOMPCriticalRegionLock(".reduction");
2118   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
2119       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
2120                : RuntimeFunction::OMPRTL___kmpc_reduce);
2121   CallInst *ReduceCall =
2122       Builder.CreateCall(ReduceFunc,
2123                          {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
2124                           ReductionFunc, Lock},
2125                          "reduce");
2126 
2127   // Create final reduction entry blocks for the atomic and non-atomic cases.
2128   // Emit IR that dispatches control flow to one of the blocks based on
2129   // whether the reduction supports the atomic mode.
2130   BasicBlock *NonAtomicRedBlock =
2131       BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
2132   BasicBlock *AtomicRedBlock =
2133       BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
2134   SwitchInst *Switch =
2135       Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
2136   Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
2137   Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
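  // By convention, __kmpc_reduce[_nowait] returns 1 to request the non-atomic
  // path, 2 to request the atomic path, and 0 when this thread has nothing to
  // reduce, in which case control falls through to the continuation block.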
2138 
2139   // Populate the non-atomic reduction using the elementwise reduction function.
2140   // This loads the elements from the global and private variables and reduces
2141   // them before storing the result back to the global variable.
2142   Builder.SetInsertPoint(NonAtomicRedBlock);
2143   for (auto En : enumerate(ReductionInfos)) {
2144     const ReductionInfo &RI = En.value();
2145     Type *ValueType = RI.ElementType;
2146     Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
2147                                          "red.value." + Twine(En.index()));
2148     Value *PrivateRedValue =
2149         Builder.CreateLoad(ValueType, RI.PrivateVariable,
2150                            "red.private.value." + Twine(En.index()));
2151     Value *Reduced;
2152     Builder.restoreIP(
2153         RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
2154     if (!Builder.GetInsertBlock())
2155       return InsertPointTy();
2156     Builder.CreateStore(Reduced, RI.Variable);
2157   }
2158   Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
2159       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
2160                : RuntimeFunction::OMPRTL___kmpc_end_reduce);
2161   Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
2162   Builder.CreateBr(ContinuationBlock);
2163 
2164   // Populate the atomic reduction using the atomic elementwise reduction
2165   // function. There are no loads/stores here because they happen inside the
2166   // atomic elementwise reduction itself.
2167   Builder.SetInsertPoint(AtomicRedBlock);
2168   if (CanGenerateAtomic) {
2169     for (const ReductionInfo &RI : ReductionInfos) {
2170       Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
2171                                               RI.Variable, RI.PrivateVariable));
2172       if (!Builder.GetInsertBlock())
2173         return InsertPointTy();
2174     }
2175     Builder.CreateBr(ContinuationBlock);
2176   } else {
2177     Builder.CreateUnreachable();
2178   }
2179 
2180   // Populate the outlined reduction function using the elementwise reduction
2181   // function. Partial values are extracted from the type-erased array of
2182   // pointers to private variables.
2183   BasicBlock *ReductionFuncBlock =
2184       BasicBlock::Create(Module->getContext(), "", ReductionFunc);
2185   Builder.SetInsertPoint(ReductionFuncBlock);
2186   Value *LHSArrayPtr = ReductionFunc->getArg(0);
2187   Value *RHSArrayPtr = ReductionFunc->getArg(1);
2188 
2189   for (auto En : enumerate(ReductionInfos)) {
2190     const ReductionInfo &RI = En.value();
2191     Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
2192         RedArrayTy, LHSArrayPtr, 0, En.index());
2193     Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
2194     Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
2195     Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
2196     Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
2197         RedArrayTy, RHSArrayPtr, 0, En.index());
2198     Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
2199     Value *RHSPtr =
2200         Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
2201     Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
2202     Value *Reduced;
2203     Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
2204     if (!Builder.GetInsertBlock())
2205       return InsertPointTy();
2206     Builder.CreateStore(Reduced, LHSPtr);
2207   }
2208   Builder.CreateRetVoid();
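  // The generated function is roughly:
  //   define internal void @.omp.reduction.func(ptr %lhs_arr, ptr %rhs_arr) {
  //     ; for each reduction i: load the element pointers at index i, load
  //     ; both values, combine them via ReductionGen, store through LHS ptr
  //     ret void
  //   }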
2209 
2210   Builder.SetInsertPoint(ContinuationBlock);
2211   return Builder.saveIP();
2212 }
2213 
2214 OpenMPIRBuilder::InsertPointTy
2215 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
2216                               BodyGenCallbackTy BodyGenCB,
2217                               FinalizeCallbackTy FiniCB) {
2218 
2219   if (!updateToLocation(Loc))
2220     return Loc.IP;
2221 
2222   Directive OMPD = Directive::OMPD_master;
2223   uint32_t SrcLocStrSize;
2224   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2225   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2226   Value *ThreadId = getOrCreateThreadID(Ident);
2227   Value *Args[] = {Ident, ThreadId};
2228 
2229   Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
2230   Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
2231 
2232   Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
2233   Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
2234 
2235   return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
2236                               /*Conditional*/ true, /*hasFinalize*/ true);
2237 }
2238 
2239 OpenMPIRBuilder::InsertPointTy
2240 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
2241                               BodyGenCallbackTy BodyGenCB,
2242                               FinalizeCallbackTy FiniCB, Value *Filter) {
2243   if (!updateToLocation(Loc))
2244     return Loc.IP;
2245 
2246   Directive OMPD = Directive::OMPD_masked;
2247   uint32_t SrcLocStrSize;
2248   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2249   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2250   Value *ThreadId = getOrCreateThreadID(Ident);
2251   Value *Args[] = {Ident, ThreadId, Filter};
2252   Value *ArgsEnd[] = {Ident, ThreadId};
2253 
2254   Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
2255   Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
2256 
2257   Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
2258   Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
2259 
2260   return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
2261                               /*Conditional*/ true, /*hasFinalize*/ true);
2262 }
2263 
2264 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
2265     DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
2266     BasicBlock *PostInsertBefore, const Twine &Name) {
2267   Module *M = F->getParent();
2268   LLVMContext &Ctx = M->getContext();
2269   Type *IndVarTy = TripCount->getType();
2270 
2271   // Create the basic block structure.
2272   BasicBlock *Preheader =
2273       BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
2274   BasicBlock *Header =
2275       BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
2276   BasicBlock *Cond =
2277       BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
2278   BasicBlock *Body =
2279       BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
2280   BasicBlock *Latch =
2281       BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
2282   BasicBlock *Exit =
2283       BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
2284   BasicBlock *After =
2285       BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
2286 
2287   // Use specified DebugLoc for new instructions.
2288   Builder.SetCurrentDebugLocation(DL);
2289 
2290   Builder.SetInsertPoint(Preheader);
2291   Builder.CreateBr(Header);
2292 
2293   Builder.SetInsertPoint(Header);
2294   PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
2295   IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
2296   Builder.CreateBr(Cond);
2297 
2298   Builder.SetInsertPoint(Cond);
2299   Value *Cmp =
2300       Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
2301   Builder.CreateCondBr(Cmp, Body, Exit);
2302 
2303   Builder.SetInsertPoint(Body);
2304   Builder.CreateBr(Latch);
2305 
2306   Builder.SetInsertPoint(Latch);
2307   Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
2308                                   "omp_" + Name + ".next", /*HasNUW=*/true);
2309   Builder.CreateBr(Header);
2310   IndVarPHI->addIncoming(Next, Latch);
2311 
2312   Builder.SetInsertPoint(Exit);
2313   Builder.CreateBr(After);
2314 
2315   // Remember and return the canonical control flow.
2316   LoopInfos.emplace_front();
2317   CanonicalLoopInfo *CL = &LoopInfos.front();
2318 
2319   CL->Header = Header;
2320   CL->Cond = Cond;
2321   CL->Latch = Latch;
2322   CL->Exit = Exit;
2323 
2324 #ifndef NDEBUG
2325   CL->assertOK();
2326 #endif
2327   return CL;
2328 }
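// For reference, the control flow built by createLoopSkeleton is roughly:
//   preheader -> header -> cond -(true)--> body -> inc -> header
//                          cond -(false)-> exit -> after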
2329 
2330 CanonicalLoopInfo *
2331 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
2332                                      LoopBodyGenCallbackTy BodyGenCB,
2333                                      Value *TripCount, const Twine &Name) {
2334   BasicBlock *BB = Loc.IP.getBlock();
2335   BasicBlock *NextBB = BB->getNextNode();
2336 
2337   CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
2338                                              NextBB, NextBB, Name);
2339   BasicBlock *After = CL->getAfter();
2340 
2341   // If location is not set, don't connect the loop.
2342   if (updateToLocation(Loc)) {
2343     // Split the loop at the insertion point: branch to the preheader and move
2344     // every following instruction to after the loop (the After BB), which also
2345     // becomes the new successor of the split block.
2346     spliceBB(Builder, After, /*CreateBranch=*/false);
2347     Builder.CreateBr(CL->getPreheader());
2348   }
2349 
2350   // Emit the body content. We do it after connecting the loop to the CFG so
2351   // that the callback does not encounter degenerate BBs.
2352   BodyGenCB(CL->getBodyIP(), CL->getIndVar());
2353 
2354 #ifndef NDEBUG
2355   CL->assertOK();
2356 #endif
2357   return CL;
2358 }
2359 
2360 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
2361     const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
2362     Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
2363     InsertPointTy ComputeIP, const Twine &Name) {
2364 
2365   // Consider the following difficulties (assuming 8-bit signed integers):
2366   //  * Adding \p Step to the loop counter which passes \p Stop may overflow:
2367   //      DO I = 1, 100, 50
2368   //  * A \p Step of INT_MIN cannot be normalized to a positive direction:
2369   //      DO I = 100, 0, -128
2370 
2371   // Start, Stop and Step must be of the same integer type.
2372   auto *IndVarTy = cast<IntegerType>(Start->getType());
2373   assert(IndVarTy == Stop->getType() && "Stop type mismatch");
2374   assert(IndVarTy == Step->getType() && "Step type mismatch");
2375 
2376   LocationDescription ComputeLoc =
2377       ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
2378   updateToLocation(ComputeLoc);
2379 
2380   ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
2381   ConstantInt *One = ConstantInt::get(IndVarTy, 1);
2382 
2383   // Like Step, but always positive.
2384   Value *Incr = Step;
2385 
2386   // Distance between Start and Stop; always positive.
2387   Value *Span;
2388 
2389   // Condition checking whether no iterations are executed at all, e.g.
2390   // because UB < LB.
2391   Value *ZeroCmp;
2392 
2393   if (IsSigned) {
2394     // Ensure that the increment is positive. If not, negate it and swap LB and UB.
2395     Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
2396     Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
2397     Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
2398     Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
2399     Span = Builder.CreateSub(UB, LB, "", false, true);
2400     ZeroCmp = Builder.CreateICmp(
2401         InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
2402   } else {
2403     Span = Builder.CreateSub(Stop, Start, "", true);
2404     ZeroCmp = Builder.CreateICmp(
2405         InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
2406   }
2407 
2408   Value *CountIfLooping;
2409   if (InclusiveStop) {
2410     CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
2411   } else {
2412     // Avoid incrementing past stop since it could overflow.
2413     Value *CountIfTwo = Builder.CreateAdd(
2414         Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
2415     Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
2416     CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
2417   }
2418   Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
2419                                           "omp_" + Name + ".tripcount");
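  // Worked example (signed, inclusive stop): DO I = 1, 100, 50 yields
  // Incr = 50, Span = 99, and TripCount = 99 / 50 + 1 = 2, i.e. the
  // iterations I = 1 and I = 51.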
2420 
2421   auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
2422     Builder.restoreIP(CodeGenIP);
2423     Value *Span = Builder.CreateMul(IV, Step);
2424     Value *IndVar = Builder.CreateAdd(Span, Start);
2425     BodyGenCB(Builder.saveIP(), IndVar);
2426   };
2427   LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
2428   return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
2429 }
2430 
2431 // Returns an LLVM function to call for initializing loop bounds using OpenMP
2432 // static scheduling depending on `Ty`. Only i32 and i64 are supported by the
2433 // runtime. Always interpret integers as unsigned similarly to
2434 // CanonicalLoopInfo.
2435 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
2436                                                   OpenMPIRBuilder &OMPBuilder) {
2437   unsigned Bitwidth = Ty->getIntegerBitWidth();
2438   if (Bitwidth == 32)
2439     return OMPBuilder.getOrCreateRuntimeFunction(
2440         M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
2441   if (Bitwidth == 64)
2442     return OMPBuilder.getOrCreateRuntimeFunction(
2443         M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
2444   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2445 }
2446 
2447 OpenMPIRBuilder::InsertPointTy
2448 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
2449                                           InsertPointTy AllocaIP,
2450                                           bool NeedsBarrier) {
2451   assert(CLI->isValid() && "Requires a valid canonical loop");
2452   assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
2453          "Require dedicated allocate IP");
2454 
2455   // Set up the source location value for the OpenMP runtime.
2456   Builder.restoreIP(CLI->getPreheaderIP());
2457   Builder.SetCurrentDebugLocation(DL);
2458 
2459   uint32_t SrcLocStrSize;
2460   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2461   Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2462 
2463   // Declare useful OpenMP runtime functions.
2464   Value *IV = CLI->getIndVar();
2465   Type *IVTy = IV->getType();
2466   FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
2467   FunctionCallee StaticFini =
2468       getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2469 
2470   // Allocate space for computed loop bounds as expected by the "init" function.
2471   Builder.restoreIP(AllocaIP);
2472   Type *I32Type = Type::getInt32Ty(M.getContext());
2473   Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2474   Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
2475   Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
2476   Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
2477 
2478   // At the end of the preheader, prepare for calling the "init" function by
2479   // storing the current loop bounds into the allocated space. A canonical loop
2480   // always iterates from 0 to trip-count with step 1. Note that "init" expects
2481   // and produces an inclusive upper bound.
2482   Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2483   Constant *Zero = ConstantInt::get(IVTy, 0);
2484   Constant *One = ConstantInt::get(IVTy, 1);
2485   Builder.CreateStore(Zero, PLowerBound);
2486   Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
2487   Builder.CreateStore(UpperBound, PUpperBound);
2488   Builder.CreateStore(One, PStride);
2489 
2490   Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2491 
2492   Constant *SchedulingType = ConstantInt::get(
2493       I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
2494 
2495   // Call the "init" function and update the trip count of the loop with the
2496   // value it produced.
2497   Builder.CreateCall(StaticInit,
2498                      {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
2499                       PUpperBound, PStride, One, Zero});
2500   Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
2501   Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
2502   Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
2503   Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
2504   CLI->setTripCount(TripCount);
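  // E.g., with 4 threads and a trip count of 128, the runtime typically hands
  // thread `tid` LB = 32 * tid and an inclusive UB of 32 * tid + 31, so the
  // per-thread trip count becomes 32 (sketch of the static-schedule contract).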
2505 
2506   // Update all uses of the induction variable except the one in the condition
2507   // block that compares it with the actual upper bound, and the increment in
2508   // the latch block.
2509 
2510   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
2511     Builder.SetInsertPoint(CLI->getBody(),
2512                            CLI->getBody()->getFirstInsertionPt());
2513     Builder.SetCurrentDebugLocation(DL);
2514     return Builder.CreateAdd(OldIV, LowerBound);
2515   });
2516 
2517   // In the "exit" block, call the "fini" function.
2518   Builder.SetInsertPoint(CLI->getExit(),
2519                          CLI->getExit()->getTerminator()->getIterator());
2520   Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2521 
2522   // Add the barrier if requested.
2523   if (NeedsBarrier)
2524     createBarrier(LocationDescription(Builder.saveIP(), DL),
2525                   omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
2526                   /* CheckCancelFlag */ false);
2527 
2528   InsertPointTy AfterIP = CLI->getAfterIP();
2529   CLI->invalidate();
2530 
2531   return AfterIP;
2532 }
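// A rough sketch of the IR this lowering produces for an i32 induction
// variable (simplified; value names are illustrative):
//
//   %p.lastiter   = alloca i32
//   %p.lowerbound = alloca i32
//   %p.upperbound = alloca i32
//   %p.stride     = alloca i32
//   ...
//   call void @__kmpc_for_static_init_4u(ptr %loc, i32 %tid,
//       i32 34 /* kmp_sch_static */, ptr %p.lastiter, ptr %p.lowerbound,
//       ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
//   ; ... loop body, with the IV offset by the loaded lower bound ...
//   call void @__kmpc_for_static_fini(ptr %loc, i32 %tid)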
2533 
2534 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
2535     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2536     bool NeedsBarrier, Value *ChunkSize) {
2537   assert(CLI->isValid() && "Requires a valid canonical loop");
2538   assert(ChunkSize && "Chunk size is required");
2539 
2540   LLVMContext &Ctx = CLI->getFunction()->getContext();
2541   Value *IV = CLI->getIndVar();
2542   Value *OrigTripCount = CLI->getTripCount();
2543   Type *IVTy = IV->getType();
2544   assert(IVTy->getIntegerBitWidth() <= 64 &&
2545          "Max supported tripcount bitwidth is 64 bits");
2546   Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
2547                                                         : Type::getInt64Ty(Ctx);
2548   Type *I32Type = Type::getInt32Ty(M.getContext());
2549   Constant *Zero = ConstantInt::get(InternalIVTy, 0);
2550   Constant *One = ConstantInt::get(InternalIVTy, 1);
2551 
2552   // Declare useful OpenMP runtime functions.
2553   FunctionCallee StaticInit =
2554       getKmpcForStaticInitForType(InternalIVTy, M, *this);
2555   FunctionCallee StaticFini =
2556       getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2557 
2558   // Allocate space for computed loop bounds as expected by the "init" function.
2559   Builder.restoreIP(AllocaIP);
2560   Builder.SetCurrentDebugLocation(DL);
2561   Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2562   Value *PLowerBound =
2563       Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
2564   Value *PUpperBound =
2565       Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
2566   Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
2567 
2568   // Set up the source location value for the OpenMP runtime.
2569   Builder.restoreIP(CLI->getPreheaderIP());
2570   Builder.SetCurrentDebugLocation(DL);
2571 
2572   // TODO: Detect overflow in ubsan or max-out with current tripcount.
2573   Value *CastedChunkSize =
2574       Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
2575   Value *CastedTripCount =
2576       Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
2577 
2578   Constant *SchedulingType = ConstantInt::get(
2579       I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
2580   Builder.CreateStore(Zero, PLowerBound);
2581   Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
2582   Builder.CreateStore(OrigUpperBound, PUpperBound);
2583   Builder.CreateStore(One, PStride);
2584 
2585   // Call the "init" function and update the trip count of the loop with the
2586   // value it produced.
2587   uint32_t SrcLocStrSize;
2588   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2589   Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2590   Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2591   Builder.CreateCall(StaticInit,
2592                      {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
2593                       /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
2594                       /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
2595                       /*pstride=*/PStride, /*incr=*/One,
2596                       /*chunk=*/CastedChunkSize});
2597 
2598   // Load values written by the "init" function.
2599   Value *FirstChunkStart =
2600       Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
2601   Value *FirstChunkStop =
2602       Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
2603   Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
2604   Value *ChunkRange =
2605       Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
2606   Value *NextChunkStride =
2607       Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
2608 
2609   // Create outer "dispatch" loop for enumerating the chunks.
2610   BasicBlock *DispatchEnter = splitBB(Builder, true);
2611   Value *DispatchCounter;
2612   CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
2613       {Builder.saveIP(), DL},
2614       [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
2615       FirstChunkStart, CastedTripCount, NextChunkStride,
2616       /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
2617       "dispatch");
2618 
2619   // Remember the BasicBlocks of the dispatch loop we need, then invalidate
2620   // the CanonicalLoopInfo so we do not have to preserve its invariants.
2621   BasicBlock *DispatchBody = DispatchCLI->getBody();
2622   BasicBlock *DispatchLatch = DispatchCLI->getLatch();
2623   BasicBlock *DispatchExit = DispatchCLI->getExit();
2624   BasicBlock *DispatchAfter = DispatchCLI->getAfter();
2625   DispatchCLI->invalidate();
2626 
2627   // Rewire the original loop to become the chunk loop inside the dispatch loop.
2628   redirectTo(DispatchAfter, CLI->getAfter(), DL);
2629   redirectTo(CLI->getExit(), DispatchLatch, DL);
2630   redirectTo(DispatchBody, DispatchEnter, DL);
2631 
2632   // Prepare the prolog of the chunk loop.
2633   Builder.restoreIP(CLI->getPreheaderIP());
2634   Builder.SetCurrentDebugLocation(DL);
2635 
2636   // Compute the number of iterations of the chunk loop.
2637   Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2638   Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
2639   Value *IsLastChunk =
2640       Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
2641   Value *CountUntilOrigTripCount =
2642       Builder.CreateSub(CastedTripCount, DispatchCounter);
2643   Value *ChunkTripCount = Builder.CreateSelect(
2644       IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
2645   Value *BackcastedChunkTC =
2646       Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
2647   CLI->setTripCount(BackcastedChunkTC);
2648 
2649   // Update all uses of the induction variable except the one in the condition
2650   // block that compares it with the actual upper bound, and the increment in
2651   // the latch block.
2652   Value *BackcastedDispatchCounter =
2653       Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
2654   CLI->mapIndVar([&](Instruction *) -> Value * {
2655     Builder.restoreIP(CLI->getBodyIP());
2656     return Builder.CreateAdd(IV, BackcastedDispatchCounter);
2657   });
2658 
2659   // In the "exit" block, call the "fini" function.
2660   Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
2661   Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2662 
2663   // Add the barrier if requested.
2664   if (NeedsBarrier)
2665     createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
2666                   /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
2667 
2668 #ifndef NDEBUG
2669   // Even though we currently do not support applying additional methods to it,
2670   // the chunk loop should remain a canonical loop.
2671   CLI->assertOK();
2672 #endif
2673 
2674   return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
2675 }
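// Conceptually, the chunked lowering above produces a loop nest of the
// following shape (pseudocode sketch):
//
//   for (dispatch = firstchunk.lb; dispatch < tripcount; dispatch += stride) {
//     chunk_tc = min(chunk_range, tripcount - dispatch);
//     for (i = 0; i < chunk_tc; ++i)
//       body(dispatch + i);
//   }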
2676 
2677 // Returns an LLVM function to call for executing an OpenMP static worksharing
2678 // for loop depending on `type`. Only i32 and i64 are supported by the runtime.
2679 // Always interpret integers as unsigned similarly to CanonicalLoopInfo.
2680 static FunctionCallee
2681 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
2682                             WorksharingLoopType LoopType) {
2683   unsigned Bitwidth = Ty->getIntegerBitWidth();
2684   Module &M = OMPBuilder->M;
2685   switch (LoopType) {
2686   case WorksharingLoopType::ForStaticLoop:
2687     if (Bitwidth == 32)
2688       return OMPBuilder->getOrCreateRuntimeFunction(
2689           M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
2690     if (Bitwidth == 64)
2691       return OMPBuilder->getOrCreateRuntimeFunction(
2692           M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
2693     break;
2694   case WorksharingLoopType::DistributeStaticLoop:
2695     if (Bitwidth == 32)
2696       return OMPBuilder->getOrCreateRuntimeFunction(
2697           M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
2698     if (Bitwidth == 64)
2699       return OMPBuilder->getOrCreateRuntimeFunction(
2700           M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
2701     break;
2702   case WorksharingLoopType::DistributeForStaticLoop:
2703     if (Bitwidth == 32)
2704       return OMPBuilder->getOrCreateRuntimeFunction(
2705           M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
2706     if (Bitwidth == 64)
2707       return OMPBuilder->getOrCreateRuntimeFunction(
2708           M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
2709     break;
2710   }
2711   if (Bitwidth != 32 && Bitwidth != 64) {
2712     llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
2713   }
2714   llvm_unreachable("Unknown type of OpenMP worksharing loop");
2715 }
2716 
2717 // Inserts a call to the proper OpenMP Device RTL function that handles
2718 // loop worksharing.
2719 static void createTargetLoopWorkshareCall(
2720     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
2721     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
2722     Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
2723   Type *TripCountTy = TripCount->getType();
2724   Module &M = OMPBuilder->M;
2725   IRBuilder<> &Builder = OMPBuilder->Builder;
2726   FunctionCallee RTLFn =
2727       getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
2728   SmallVector<Value *, 8> RealArgs;
2729   RealArgs.push_back(Ident);
2730   RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
2731   RealArgs.push_back(LoopBodyArg);
2732   RealArgs.push_back(TripCount);
2733   if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
2734     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2735     Builder.CreateCall(RTLFn, RealArgs);
2736     return;
2737   }
2738   FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
2739       M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
2740   Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
2741   Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
2742 
2743   RealArgs.push_back(
2744       Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
2745   RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2746   if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
2747     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2748   }
2749 
2750   Builder.CreateCall(RTLFn, RealArgs);
2751 }
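// The assembled argument lists roughly correspond to (sketch; parameter names
// are illustrative, not the runtime's declarations):
//   distribute:     (ident, body_fn, body_arg, tripcount, 0)
//   for:            (ident, body_fn, body_arg, tripcount, num_threads, 0)
//   distribute for: (ident, body_fn, body_arg, tripcount, num_threads, 0, 0)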
2752 
2753 static void
2754 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
2755                             CanonicalLoopInfo *CLI, Value *Ident,
2756                             Function &OutlinedFn, Type *ParallelTaskPtr,
2757                             const SmallVector<Instruction *, 4> &ToBeDeleted,
2758                             WorksharingLoopType LoopType) {
2759   IRBuilder<> &Builder = OMPIRBuilder->Builder;
2760   BasicBlock *Preheader = CLI->getPreheader();
2761   Value *TripCount = CLI->getTripCount();
2762 
2763   // After loop body outlining, the loop body contains only the setup of the
2764   // loop body argument structure and the call to the outlined loop body
2765   // function. First, we need to move the setup of the loop body arguments
2766   // into the loop preheader.
2767   Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
2768                     CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
2769 
2770   // The next step is to remove the whole loop; we do not need it anymore.
2771   // That's why we create an unconditional branch from the loop preheader to
2772   // the loop exit block.
2773   Builder.restoreIP({Preheader, Preheader->end()});
2774   Preheader->getTerminator()->eraseFromParent();
2775   Builder.CreateBr(CLI->getExit());
2776 
2777   // Delete dead loop blocks
2778   OpenMPIRBuilder::OutlineInfo CleanUpInfo;
2779   SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
2780   SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
2781   CleanUpInfo.EntryBB = CLI->getHeader();
2782   CleanUpInfo.ExitBB = CLI->getExit();
2783   CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
2784   DeleteDeadBlocks(BlocksToBeRemoved);
2785 
2786   // Find the instruction that corresponds to the loop body argument structure
2787   // and remove the call to the loop body function.
2788   Value *LoopBodyArg;
2789   User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
2790   assert(OutlinedFnUser &&
2791          "Expected unique undroppable user of outlined function");
2792   CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
2793   assert(OutlinedFnCallInstruction && "Expected outlined function call");
2794   assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
2795          "Expected outlined function call to be located in loop preheader");
2796   // Check in case no argument structure has been passed.
2797   if (OutlinedFnCallInstruction->arg_size() > 1)
2798     LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
2799   else
2800     LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
2801   OutlinedFnCallInstruction->eraseFromParent();
2802 
2803   createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
2804                                 LoopBodyArg, ParallelTaskPtr, TripCount,
2805                                 OutlinedFn);
2806 
2807   for (auto &ToBeDeletedItem : ToBeDeleted)
2808     ToBeDeletedItem->eraseFromParent();
2809   CLI->invalidate();
2810 }
2811 
2812 OpenMPIRBuilder::InsertPointTy
2813 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
2814                                           InsertPointTy AllocaIP,
2815                                           WorksharingLoopType LoopType) {
2816   uint32_t SrcLocStrSize;
2817   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2818   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2819 
2820   OutlineInfo OI;
2821   OI.OuterAllocaBB = CLI->getPreheader();
2822   Function *OuterFn = CLI->getPreheader()->getParent();
2823 
2824   // Instructions which need to be deleted at the end of code generation
2825   SmallVector<Instruction *, 4> ToBeDeleted;
2826 
2827   OI.OuterAllocaBB = AllocaIP.getBlock();
2828 
2829   // Mark the loop body as the region that needs to be extracted.
2830   OI.EntryBB = CLI->getBody();
2831   OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
2832                                                "omp.prelatch", true);
2833 
2834   // Prepare loop body for extraction
2835   Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
2836 
2837   // Insert a new loop counter variable, which will be used only in the loop
2838   // body.
2839   AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
2840   Instruction *NewLoopCntLoad =
2841       Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
2842   // The new loop counter instructions become redundant in the loop preheader
2843   // once code generation for the workshare loop is finished. That's why we
2844   // mark them as ready for deletion.
2845   ToBeDeleted.push_back(NewLoopCntLoad);
2846   ToBeDeleted.push_back(NewLoopCnt);
2847 
2848   // Analyse the loop body region and find all input variables that are used
2849   // inside it.
2850   SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
2851   SmallVector<BasicBlock *, 32> Blocks;
2852   OI.collectBlocks(ParallelRegionBlockSet, Blocks);
2853   SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
2854                                         ParallelRegionBlockSet.end());
2855 
2856   CodeExtractorAnalysisCache CEAC(*OuterFn);
2857   CodeExtractor Extractor(Blocks,
2858                           /* DominatorTree */ nullptr,
2859                           /* AggregateArgs */ true,
2860                           /* BlockFrequencyInfo */ nullptr,
2861                           /* BranchProbabilityInfo */ nullptr,
2862                           /* AssumptionCache */ nullptr,
2863                           /* AllowVarArgs */ true,
2864                           /* AllowAlloca */ true,
2865                           /* AllocationBlock */ CLI->getPreheader(),
2866                           /* Suffix */ ".omp_wsloop",
2867                           /* AggrArgsIn0AddrSpace */ true);
2868 
2869   BasicBlock *CommonExit = nullptr;
2870   SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
2871 
2872   // Find allocas outside the loop body region that are used inside the loop
2873   // body.
2874   Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
2875 
2876   // We need to model the loop body region as the function f(cnt, loop_arg).
2877   // That's why we replace the loop induction variable with the new counter,
2878   // which will be one of the loop body function's arguments.
2879   SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
2880                             CLI->getIndVar()->user_end());
2881   for (auto Use : Users) {
2882     if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
2883       if (ParallelRegionBlockSet.count(Inst->getParent())) {
2884         Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
2885       }
2886     }
2887   }
2888   // Make sure that the loop counter variable is not merged into the loop body
2889   // function's argument structure and that it is passed as a separate variable.
2890   OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
2891 
2892   // The PostOutline callback is invoked once the loop body function has been
2893   // outlined and the loop body has been replaced by a call to the outlined
2894   // function. We then need to add a call to the OpenMP device RTL in the loop
2895   // preheader; the OpenMP device RTL function handles the loop control logic.
2896   //
2897   OI.PostOutlineCB = [=, ToBeDeletedVec =
2898                              std::move(ToBeDeleted)](Function &OutlinedFn) {
2899     workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
2900                                 ToBeDeletedVec, LoopType);
2901   };
2902   addOutlineInfo(std::move(OI));
2903   return CLI->getAfterIP();
2904 }
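// In effect, on the device the original loop
//   for (cnt = 0; cnt < tc; ++cnt) body(cnt);
// becomes a single runtime call of the form (sketch)
//   __kmpc_for_static_loop_4u(ident, outlined_body, args, tc, num_threads, 0);
// where outlined_body(cnt, args) contains the former loop body and the device
// RTL performs the iteration-space partitioning.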
2905 
2906 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
2907     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2908     bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
2909     bool HasSimdModifier, bool HasMonotonicModifier,
2910     bool HasNonmonotonicModifier, bool HasOrderedClause,
2911     WorksharingLoopType LoopType) {
2912   if (Config.isTargetDevice())
2913     return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
2914   OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
2915       SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
2916       HasNonmonotonicModifier, HasOrderedClause);
2917 
2918   bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
2919                    OMPScheduleType::ModifierOrdered;
2920   switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
2921   case OMPScheduleType::BaseStatic:
2922     assert(!ChunkSize && "No chunk size with static-chunked schedule");
2923     if (IsOrdered)
2924       return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2925                                        NeedsBarrier, ChunkSize);
2926     // FIXME: Monotonicity ignored?
2927     return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
2928 
2929   case OMPScheduleType::BaseStaticChunked:
2930     if (IsOrdered)
2931       return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2932                                        NeedsBarrier, ChunkSize);
2933     // FIXME: Monotonicity ignored?
2934     return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
2935                                            ChunkSize);
2936 
2937   case OMPScheduleType::BaseRuntime:
2938   case OMPScheduleType::BaseAuto:
2939   case OMPScheduleType::BaseGreedy:
2940   case OMPScheduleType::BaseBalanced:
2941   case OMPScheduleType::BaseSteal:
2942   case OMPScheduleType::BaseGuidedSimd:
2943   case OMPScheduleType::BaseRuntimeSimd:
2944     assert(!ChunkSize &&
2945            "schedule type does not support user-defined chunk sizes");
2946     [[fallthrough]];
2947   case OMPScheduleType::BaseDynamicChunked:
2948   case OMPScheduleType::BaseGuidedChunked:
2949   case OMPScheduleType::BaseGuidedIterativeChunked:
2950   case OMPScheduleType::BaseGuidedAnalyticalChunked:
2951   case OMPScheduleType::BaseStaticBalancedChunked:
2952     return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
2953                                      NeedsBarrier, ChunkSize);
2954 
2955   default:
2956     llvm_unreachable("Unknown/unimplemented schedule kind");
2957   }
2958 }
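// For example, '#pragma omp for schedule(static)' takes the BaseStatic path
// and is lowered by applyStaticWorkshareLoop, 'schedule(static, 4)' takes
// BaseStaticChunked, and 'schedule(dynamic, 4)' ends up in
// applyDynamicWorkshareLoop with a chunk size of 4.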
2959 
2960 /// Returns an LLVM function to call for initializing loop bounds using OpenMP
2961 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
2962 /// the runtime. Always interpret integers as unsigned similarly to
2963 /// CanonicalLoopInfo.
2964 static FunctionCallee
2965 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
2966   unsigned Bitwidth = Ty->getIntegerBitWidth();
2967   if (Bitwidth == 32)
2968     return OMPBuilder.getOrCreateRuntimeFunction(
2969         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
2970   if (Bitwidth == 64)
2971     return OMPBuilder.getOrCreateRuntimeFunction(
2972         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
2973   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2974 }
2975 
2976 /// Returns an LLVM function to call for updating the next loop using OpenMP
2977 /// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
2978 /// the runtime. Always interpret integers as unsigned similarly to
2979 /// CanonicalLoopInfo.
2980 static FunctionCallee
2981 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
2982   unsigned Bitwidth = Ty->getIntegerBitWidth();
2983   if (Bitwidth == 32)
2984     return OMPBuilder.getOrCreateRuntimeFunction(
2985         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
2986   if (Bitwidth == 64)
2987     return OMPBuilder.getOrCreateRuntimeFunction(
2988         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
2989   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2990 }
2991 
2992 /// Returns an LLVM function to call for finalizing the dynamic loop,
2993 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
2994 /// interpret integers as unsigned similarly to CanonicalLoopInfo.
2995 static FunctionCallee
2996 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
2997   unsigned Bitwidth = Ty->getIntegerBitWidth();
2998   if (Bitwidth == 32)
2999     return OMPBuilder.getOrCreateRuntimeFunction(
3000         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
3001   if (Bitwidth == 64)
3002     return OMPBuilder.getOrCreateRuntimeFunction(
3003         M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
3004   llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3005 }
3006 
3007 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
3008     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
3009     OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
3010   assert(CLI->isValid() && "Requires a valid canonical loop");
3011   assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3012          "Require dedicated allocate IP");
3013   assert(isValidWorkshareLoopScheduleType(SchedType) &&
3014          "Require valid schedule type");
3015 
3016   bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
3017                  OMPScheduleType::ModifierOrdered;
3018 
3019   // Set up the source location value for OpenMP runtime.
3020   Builder.SetCurrentDebugLocation(DL);
3021 
3022   uint32_t SrcLocStrSize;
3023   Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
3024   Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3025 
3026   // Declare useful OpenMP runtime functions.
3027   Value *IV = CLI->getIndVar();
3028   Type *IVTy = IV->getType();
3029   FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
3030   FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
3031 
3032   // Allocate space for computed loop bounds as expected by the "init" function.
3033   Builder.restoreIP(AllocaIP);
3034   Type *I32Type = Type::getInt32Ty(M.getContext());
3035   Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
3036   Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
3037   Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
3038   Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
3039 
3040   // At the end of the preheader, prepare for calling the "init" function by
3041   // storing the current loop bounds into the allocated space. A canonical loop
3042   // always iterates from 0 to trip-count with step 1. Note that "init" expects
3043   // and produces an inclusive upper bound.
3044   BasicBlock *PreHeader = CLI->getPreheader();
3045   Builder.SetInsertPoint(PreHeader->getTerminator());
3046   Constant *One = ConstantInt::get(IVTy, 1);
3047   Builder.CreateStore(One, PLowerBound);
3048   Value *UpperBound = CLI->getTripCount();
3049   Builder.CreateStore(UpperBound, PUpperBound);
3050   Builder.CreateStore(One, PStride);
3051 
3052   BasicBlock *Header = CLI->getHeader();
3053   BasicBlock *Exit = CLI->getExit();
3054   BasicBlock *Cond = CLI->getCond();
3055   BasicBlock *Latch = CLI->getLatch();
3056   InsertPointTy AfterIP = CLI->getAfterIP();
3057 
3058   // The CLI will be "broken" in the code below, as the loop is no longer
3059   // a valid canonical loop.
3060 
3061   if (!Chunk)
3062     Chunk = One;
3063 
3064   Value *ThreadNum = getOrCreateThreadID(SrcLoc);
3065 
3066   Constant *SchedulingType =
3067       ConstantInt::get(I32Type, static_cast<int>(SchedType));
3068 
3069   // Call the "init" function.
3070   Builder.CreateCall(DynamicInit,
3071                      {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
3072                       UpperBound, /* step */ One, Chunk});
3073 
3074   // An outer loop around the existing one.
3075   BasicBlock *OuterCond = BasicBlock::Create(
3076       PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
3077       PreHeader->getParent());
3078   // The __kmpc_dispatch_next result is always i32, so we need a 32-bit zero.
3079   Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
3080   Value *Res =
3081       Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
3082                                        PLowerBound, PUpperBound, PStride});
3083   Constant *Zero32 = ConstantInt::get(I32Type, 0);
3084   Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
3085   Value *LowerBound =
3086       Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
3087   Builder.CreateCondBr(MoreWork, Header, Exit);
3088 
3089   // Change PHI-node in loop header to use outer cond rather than preheader,
3090   // and set IV to the LowerBound.
3091   Instruction *Phi = &Header->front();
3092   auto *PI = cast<PHINode>(Phi);
3093   PI->setIncomingBlock(0, OuterCond);
3094   PI->setIncomingValue(0, LowerBound);
3095 
3096   // Then set the pre-header to jump to the OuterCond
3097   Instruction *Term = PreHeader->getTerminator();
3098   auto *Br = cast<BranchInst>(Term);
3099   Br->setSuccessor(0, OuterCond);
3100 
3101   // Modify the inner condition:
3102   // * Use the UpperBound returned from the DynamicNext call.
3103   // * Jump to the outer loop when done with one of the inner loops.
3104   Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
3105   UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
3106   Instruction *Comp = &*Builder.GetInsertPoint();
3107   auto *CI = cast<CmpInst>(Comp);
3108   CI->setOperand(1, UpperBound);
3109   // Redirect the inner exit to branch to outer condition.
3110   Instruction *Branch = &Cond->back();
3111   auto *BI = cast<BranchInst>(Branch);
3112   assert(BI->getSuccessor(1) == Exit);
3113   BI->setSuccessor(1, OuterCond);
3114 
3115   // Call the "fini" function if "ordered" is present in wsloop directive.
3116   if (Ordered) {
3117     Builder.SetInsertPoint(&Latch->back());
3118     FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
3119     Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
3120   }
3121 
3122   // Add the barrier if requested.
3123   if (NeedsBarrier) {
3124     Builder.SetInsertPoint(&Exit->back());
3125     createBarrier(LocationDescription(Builder.saveIP(), DL),
3126                   omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
3127                   /* CheckCancelFlag */ false);
3128   }
3129 
3130   CLI->invalidate();
3131   return AfterIP;
3132 }
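// Sketch of the control flow after this rewrite (simplified):
//
//   preheader:  call @__kmpc_dispatch_init_4u(...); br %outer.cond
//   outer.cond: %more = call @__kmpc_dispatch_next_4u(%loc, %tid, %p.lastiter,
//                           %p.lowerbound, %p.upperbound, %p.stride)
//               br %more != 0 ? %header : %exit
//   cond:       br %iv ult %ub ? %body : %outer.cond
//
// Each successful "next" call hands out one [lb, ub] range, which the inner
// loop then executes.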
3133 
3134 /// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
3135 /// after this, \p OldTarget will be orphaned.
3136 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
3137                                       BasicBlock *NewTarget, DebugLoc DL) {
3138   for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
3139     redirectTo(Pred, NewTarget, DL);
3140 }
3141 
3142 /// Determine which blocks in \p BBs are reachable from outside and remove
3143 /// from the function the ones that are not.
3144 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
3145   SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
3146   auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
3147     for (Use &U : BB->uses()) {
3148       auto *UseInst = dyn_cast<Instruction>(U.getUser());
3149       if (!UseInst)
3150         continue;
3151       if (BBsToErase.count(UseInst->getParent()))
3152         continue;
3153       return true;
3154     }
3155     return false;
3156   };
3157 
3158   while (true) {
3159     bool Changed = false;
3160     for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
3161       if (HasRemainingUses(BB)) {
3162         BBsToErase.erase(BB);
3163         Changed = true;
3164       }
3165     }
3166     if (!Changed)
3167       break;
3168   }
3169 
3170   SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
3171   DeleteDeadBlocks(BBVec);
3172 }
3173 
3174 CanonicalLoopInfo *
3175 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
3176                                InsertPointTy ComputeIP) {
3177   assert(Loops.size() >= 1 && "At least one loop required");
3178   size_t NumLoops = Loops.size();
3179 
3180   // Nothing to do if there is already just one loop.
3181   if (NumLoops == 1)
3182     return Loops.front();
3183 
3184   CanonicalLoopInfo *Outermost = Loops.front();
3185   CanonicalLoopInfo *Innermost = Loops.back();
3186   BasicBlock *OrigPreheader = Outermost->getPreheader();
3187   BasicBlock *OrigAfter = Outermost->getAfter();
3188   Function *F = OrigPreheader->getParent();
3189 
3190   // Loop control blocks that may become orphaned later.
3191   SmallVector<BasicBlock *, 12> OldControlBBs;
3192   OldControlBBs.reserve(6 * Loops.size());
3193   for (CanonicalLoopInfo *Loop : Loops)
3194     Loop->collectControlBlocks(OldControlBBs);
3195 
3196   // Set up the IRBuilder for inserting the trip count computation.
3197   Builder.SetCurrentDebugLocation(DL);
3198   if (ComputeIP.isSet())
3199     Builder.restoreIP(ComputeIP);
3200   else
3201     Builder.restoreIP(Outermost->getPreheaderIP());
3202 
3203   // Derive the collapsed loop's trip count.
3204   // TODO: Find common/largest indvar type.
3205   Value *CollapsedTripCount = nullptr;
3206   for (CanonicalLoopInfo *L : Loops) {
3207     assert(L->isValid() &&
3208            "All loops to collapse must be valid canonical loops");
3209     Value *OrigTripCount = L->getTripCount();
3210     if (!CollapsedTripCount) {
3211       CollapsedTripCount = OrigTripCount;
3212       continue;
3213     }
3214 
3215     // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
3216     CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
3217                                            {}, /*HasNUW=*/true);
3218   }
3219 
3220   // Create the collapsed loop control flow.
3221   CanonicalLoopInfo *Result =
3222       createLoopSkeleton(DL, CollapsedTripCount, F,
3223                          OrigPreheader->getNextNode(), OrigAfter, "collapsed");
3224 
3225   // Build the collapsed loop body code.
3226   // Start with deriving the input loop induction variables from the collapsed
3227   // one, using a divmod scheme. To preserve the original loops' order, the
3228   // innermost loop uses the least significant bits.
3229   Builder.restoreIP(Result->getBodyIP());
3230 
3231   Value *Leftover = Result->getIndVar();
3232   SmallVector<Value *> NewIndVars;
3233   NewIndVars.resize(NumLoops);
3234   for (int i = NumLoops - 1; i >= 1; --i) {
3235     Value *OrigTripCount = Loops[i]->getTripCount();
3236 
3237     Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
3238     NewIndVars[i] = NewIndVar;
3239 
3240     Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
3241   }
3242   // Outermost loop gets all the remaining bits.
3243   NewIndVars[0] = Leftover;
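  // Worked example: collapsing two loops with trip counts 3 (outer) and 4
  // (inner) gives a collapsed trip count of 12; for the collapsed IV value 7,
  // the derived indvars are 7 urem 4 == 3 (inner) and 7 udiv 4 == 1 (outer).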
3244 
3245   // Construct the loop body control flow.
3246   // We progressively construct the branch structure following the direction
3247   // of the control flow: the leading in-between code, then the loop nest body,
3248   // then the trailing in-between code, finally rejoining the collapsed loop's
3249   // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
3250   // next edge. If ContinueBlock is set, continue with that block; if
3251   // ContinuePred is set, use its predecessors as sources.
3252   BasicBlock *ContinueBlock = Result->getBody();
3253   BasicBlock *ContinuePred = nullptr;
3254   auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
3255                                                           BasicBlock *NextSrc) {
3256     if (ContinueBlock)
3257       redirectTo(ContinueBlock, Dest, DL);
3258     else
3259       redirectAllPredecessorsTo(ContinuePred, Dest, DL);
3260 
3261     ContinueBlock = nullptr;
3262     ContinuePred = NextSrc;
3263   };
3264 
3265   // The code before the nested loop of each level.
3266   // Because we are sinking it into the nest, it will be executed more often
3267   // than in the original loop. More sophisticated schemes could keep track of what
3268   // the in-between code is and instantiate it only once per thread.
3269   for (size_t i = 0; i < NumLoops - 1; ++i)
3270     ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
3271 
3272   // Connect the loop nest body.
3273   ContinueWith(Innermost->getBody(), Innermost->getLatch());
3274 
3275   // The code after the nested loop at each level.
3276   for (size_t i = NumLoops - 1; i > 0; --i)
3277     ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
3278 
3279   // Connect the finished loop to the collapsed loop latch.
3280   ContinueWith(Result->getLatch(), nullptr);
3281 
3282   // Replace the input loops with the new collapsed loop.
3283   redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
3284   redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
3285 
3286   // Replace the input loop indvars with the derived ones.
3287   for (size_t i = 0; i < NumLoops; ++i)
3288     Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
3289 
3290   // Remove unused parts of the input loops.
3291   removeUnusedBlocksFromParent(OldControlBBs);
3292 
3293   for (CanonicalLoopInfo *L : Loops)
3294     L->invalidate();
3295 
3296 #ifndef NDEBUG
3297   Result->assertOK();
3298 #endif
3299   return Result;
3300 }
3301 
3302 std::vector<CanonicalLoopInfo *>
3303 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
3304                            ArrayRef<Value *> TileSizes) {
3305   assert(TileSizes.size() == Loops.size() &&
3306          "Must pass as many tile sizes as there are loops");
3307   int NumLoops = Loops.size();
3308   assert(NumLoops >= 1 && "At least one loop to tile required");
3309 
3310   CanonicalLoopInfo *OutermostLoop = Loops.front();
3311   CanonicalLoopInfo *InnermostLoop = Loops.back();
3312   Function *F = OutermostLoop->getBody()->getParent();
3313   BasicBlock *InnerEnter = InnermostLoop->getBody();
3314   BasicBlock *InnerLatch = InnermostLoop->getLatch();
3315 
3316   // Loop control blocks that may become orphaned later.
3317   SmallVector<BasicBlock *, 12> OldControlBBs;
3318   OldControlBBs.reserve(6 * Loops.size());
3319   for (CanonicalLoopInfo *Loop : Loops)
3320     Loop->collectControlBlocks(OldControlBBs);
3321 
3322   // Collect original trip counts and induction variables to be accessible by
3323   // index. Also, the structure of the original loops is not preserved during
3324   // the construction of the tiled loops, so do it before we scavenge the BBs of
3325   // any original CanonicalLoopInfo.
3326   SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
3327   for (CanonicalLoopInfo *L : Loops) {
3328     assert(L->isValid() && "All input loops must be valid canonical loops");
3329     OrigTripCounts.push_back(L->getTripCount());
3330     OrigIndVars.push_back(L->getIndVar());
3331   }
3332 
3333   // Collect the code between loop headers. These may contain SSA definitions
3334   // that are used in the loop nest body. To be usable within the innermost
3335   // body, these BasicBlocks will be sunk into the loop nest body. That is,
3336   // these instructions may be executed more often than before the tiling.
3337   // TODO: It would be sufficient to only sink them into body of the
3338   // corresponding tile loop.
3339   SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
3340   for (int i = 0; i < NumLoops - 1; ++i) {
3341     CanonicalLoopInfo *Surrounding = Loops[i];
3342     CanonicalLoopInfo *Nested = Loops[i + 1];
3343 
3344     BasicBlock *EnterBB = Surrounding->getBody();
3345     BasicBlock *ExitBB = Nested->getHeader();
3346     InbetweenCode.emplace_back(EnterBB, ExitBB);
3347   }
3348 
3349   // Compute the trip counts of the floor loops.
3350   Builder.SetCurrentDebugLocation(DL);
3351   Builder.restoreIP(OutermostLoop->getPreheaderIP());
3352   SmallVector<Value *, 4> FloorCount, FloorRems;
3353   for (int i = 0; i < NumLoops; ++i) {
3354     Value *TileSize = TileSizes[i];
3355     Value *OrigTripCount = OrigTripCounts[i];
3356     Type *IVType = OrigTripCount->getType();
3357 
3358     Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
3359     Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
3360 
3361     // 0 if the tilesize divides the tripcount, 1 otherwise.
3362     // 1 means we need an additional iteration for a partial tile.
3363     //
3364     // Unfortunately we cannot just use the roundup-formula
3365     //   (tripcount + tilesize - 1)/tilesize
3366     // because the summation might overflow. We do not want to introduce
3367     // undefined behavior when the untiled loop nest did not have any.
3368     Value *FloorTripOverflow =
3369         Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
3370 
3371     FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
3372     FloorTripCount =
3373         Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
3374                           "omp_floor" + Twine(i) + ".tripcount", true);
3375 
3376     // Remember some values for later use.
3377     FloorCount.push_back(FloorTripCount);
3378     FloorRems.push_back(FloorTripRem);
3379   }
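  // For example, with an original trip count of 10 and a tile size of 4, the
  // floor loop gets trip count 10/4 + (10%4 != 0) == 3 and remainder 2: two
  // full tiles followed by one partial tile of 2 iterations.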
3380 
3381   // Generate the new loop nest, from the outermost to the innermost.
3382   std::vector<CanonicalLoopInfo *> Result;
3383   Result.reserve(NumLoops * 2);
3384 
3385   // The basic block of the surrounding loop that enters the next generated
3386   // loop.
3387   BasicBlock *Enter = OutermostLoop->getPreheader();
3388 
3389   // The basic block of the surrounding loop where the inner code should
3390   // continue.
3391   BasicBlock *Continue = OutermostLoop->getAfter();
3392 
3393   // Where the next loop basic block should be inserted.
3394   BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
3395 
3396   auto EmbeddNewLoop =
3397       [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
3398           Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
3399     CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
3400         DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
3401     redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
3402     redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
3403 
3404     // Set up the position where the next embedded loop connects to this loop.
3405     Enter = EmbeddedLoop->getBody();
3406     Continue = EmbeddedLoop->getLatch();
3407     OutroInsertBefore = EmbeddedLoop->getLatch();
3408     return EmbeddedLoop;
3409   };
3410 
3411   auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
3412                                                   const Twine &NameBase) {
3413     for (auto P : enumerate(TripCounts)) {
3414       CanonicalLoopInfo *EmbeddedLoop =
3415           EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
3416       Result.push_back(EmbeddedLoop);
3417     }
3418   };
3419 
3420   EmbeddNewLoops(FloorCount, "floor");
3421 
3422   // Within the innermost floor loop, emit the code that computes the tile
3423   // sizes.
3424   Builder.SetInsertPoint(Enter->getTerminator());
3425   SmallVector<Value *, 4> TileCounts;
3426   for (int i = 0; i < NumLoops; ++i) {
3427     CanonicalLoopInfo *FloorLoop = Result[i];
3428     Value *TileSize = TileSizes[i];
3429 
3430     Value *FloorIsEpilogue =
3431         Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
3432     Value *TileTripCount =
3433         Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
3434 
3435     TileCounts.push_back(TileTripCount);
3436   }
3437 
3438   // Create the tile loops.
3439   EmbeddNewLoops(TileCounts, "tile");
3440 
3441   // Insert the in-between code into the body.
3442   BasicBlock *BodyEnter = Enter;
3443   BasicBlock *BodyEntered = nullptr;
3444   for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
3445     BasicBlock *EnterBB = P.first;
3446     BasicBlock *ExitBB = P.second;
3447 
3448     if (BodyEnter)
3449       redirectTo(BodyEnter, EnterBB, DL);
3450     else
3451       redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
3452 
3453     BodyEnter = nullptr;
3454     BodyEntered = ExitBB;
3455   }
3456 
3457   // Append the original loop nest body into the generated loop nest body.
3458   if (BodyEnter)
3459     redirectTo(BodyEnter, InnerEnter, DL);
3460   else
3461     redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
3462   redirectAllPredecessorsTo(InnerLatch, Continue, DL);
3463 
3464   // Replace the original induction variable with an induction variable computed
3465   // from the tile and floor induction variables.
3466   Builder.restoreIP(Result.back()->getBodyIP());
3467   for (int i = 0; i < NumLoops; ++i) {
3468     CanonicalLoopInfo *FloorLoop = Result[i];
3469     CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
3470     Value *OrigIndVar = OrigIndVars[i];
3471     Value *Size = TileSizes[i];
3472 
3473     Value *Scale =
3474         Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
3475     Value *Shift =
3476         Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
3477     OrigIndVar->replaceAllUsesWith(Shift);
3478   }
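  // E.g., with a tile size of 4, a floor indvar of 2 and a tile indvar of 3,
  // the original induction variable is reconstructed as 2 * 4 + 3 == 11.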
3479 
3480   // Remove unused parts of the original loops.
3481   removeUnusedBlocksFromParent(OldControlBBs);
3482 
3483   for (CanonicalLoopInfo *L : Loops)
3484     L->invalidate();
3485 
3486 #ifndef NDEBUG
3487   for (CanonicalLoopInfo *GenL : Result)
3488     GenL->assertOK();
3489 #endif
3490   return Result;
3491 }
3492 
3493 /// Attach metadata \p Properties to the basic block described by \p BB. If the
3494 /// basic block already has metadata, the basic block properties are appended.
3495 static void addBasicBlockMetadata(BasicBlock *BB,
3496                                   ArrayRef<Metadata *> Properties) {
3497   // Nothing to do if no property to attach.
3498   if (Properties.empty())
3499     return;
3500 
3501   LLVMContext &Ctx = BB->getContext();
3502   SmallVector<Metadata *> NewProperties;
3503   NewProperties.push_back(nullptr);
3504 
3505   // If the basic block already has metadata, prepend it to the new metadata.
3506   MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
3507   if (Existing)
3508     append_range(NewProperties, drop_begin(Existing->operands(), 1));
3509 
3510   append_range(NewProperties, Properties);
3511   MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
3512   BasicBlockID->replaceOperandWith(0, BasicBlockID);
3513 
3514   BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
3515 }
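// The first operand of the produced node refers to the node itself, e.g.
// (sketch):
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}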
3516 
3517 /// Attach loop metadata \p Properties to the loop described by \p Loop. If the
3518 /// loop already has metadata, the loop properties are appended.
3519 static void addLoopMetadata(CanonicalLoopInfo *Loop,
3520                             ArrayRef<Metadata *> Properties) {
3521   assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
3522 
3523   // Attach metadata to the loop's latch
3524   BasicBlock *Latch = Loop->getLatch();
3525   assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
3526   addBasicBlockMetadata(Latch, Properties);
3527 }
3528 
3529 /// Attach llvm.access.group metadata to the memref instructions of \p Block
3530 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
3531                             LoopInfo &LI) {
3532   for (Instruction &I : *Block) {
3533     if (I.mayReadOrWriteMemory()) {
3534       // TODO: This instruction may already have an access group from other
3535       // pragmas, e.g. #pragma clang loop vectorize. Append to it so that the
3536       // existing metadata is not overwritten.
3537       I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
3538     }
3539   }
3540 }
3541 
3542 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
3543   LLVMContext &Ctx = Builder.getContext();
3544   addLoopMetadata(
3545       Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3546              MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
3547 }
3548 
3549 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
3550   LLVMContext &Ctx = Builder.getContext();
3551   addLoopMetadata(
3552       Loop, {
3553                 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3554             });
3555 }
3556 
3557 void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
3558                                       Value *IfCond, ValueToValueMapTy &VMap,
3559                                       const Twine &NamePrefix) {
3560   Function *F = CanonicalLoop->getFunction();
3561 
3562   // Define where the if branch should be inserted.
3563   Instruction *SplitBefore;
3564   if (Instruction::classof(IfCond)) {
3565     SplitBefore = dyn_cast<Instruction>(IfCond);
3566   } else {
3567     SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
3568   }
3569 
3570   // TODO: We should not rely on the pass manager. Currently we use it only
3571   // to get the llvm::Loop that corresponds to the given CanonicalLoopInfo
3572   // object. We should have a method that returns all blocks between
3573   // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter().
3574   FunctionAnalysisManager FAM;
3575   FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3576   FAM.registerPass([]() { return LoopAnalysis(); });
3577   FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3578 
3579   // Get the loop which needs to be cloned
3580   LoopAnalysis LIA;
3581   LoopInfo &&LI = LIA.run(*F, FAM);
3582   Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
3583 
3584   // Create additional blocks for the if statement
3585   BasicBlock *Head = SplitBefore->getParent();
3586   Instruction *HeadOldTerm = Head->getTerminator();
3587   llvm::LLVMContext &C = Head->getContext();
3588   llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
3589       C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
3590   llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
3591       C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
3592 
3593   // Create if condition branch.
3594   Builder.SetInsertPoint(HeadOldTerm);
3595   Instruction *BrInstr =
3596       Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
3597   InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
3598   // The then-block contains the branch to the OMP loop that is to be vectorized.
3599   spliceBB(IP, ThenBlock, false);
3600   ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
3601 
3602   Builder.SetInsertPoint(ElseBlock);
3603 
3604   // Clone loop for the else branch
3605   SmallVector<BasicBlock *, 8> NewBlocks;
3606 
3607   VMap[CanonicalLoop->getPreheader()] = ElseBlock;
3608   for (BasicBlock *Block : L->getBlocks()) {
3609     BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
3610     NewBB->moveBefore(CanonicalLoop->getExit());
3611     VMap[Block] = NewBB;
3612     NewBlocks.push_back(NewBB);
3613   }
3614   remapInstructionsInBlocks(NewBlocks, VMap);
3615   Builder.CreateBr(NewBlocks.front());
3616 }
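// With NamePrefix == "simd", the resulting structure is roughly (sketch):
//
//   head:         br i1 %ifcond, label %simd.if.then, label %simd.if.else
//   simd.if.then: ; continues to the original, vectorizable loop
//   simd.if.else: ; branches to the cloned loop, which the caller may mark
//                 ; with "llvm.loop.vectorize.enable" = false (see applySimd)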
3617 
3618 unsigned
3619 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
3620                                            const StringMap<bool> &Features) {
3621   if (TargetTriple.isX86()) {
3622     if (Features.lookup("avx512f"))
3623       return 512;
3624     else if (Features.lookup("avx"))
3625       return 256;
3626     return 128;
3627   }
3628   if (TargetTriple.isPPC())
3629     return 128;
3630   if (TargetTriple.isWasm())
3631     return 128;
3632   return 0;
3633 }
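// For example, an x86 target with the "avx" feature but without "avx512f"
// reports a default simd alignment of 256 bits.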
3634 
3635 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
3636                                 MapVector<Value *, Value *> AlignedVars,
3637                                 Value *IfCond, OrderKind Order,
3638                                 ConstantInt *Simdlen, ConstantInt *Safelen) {
3639   LLVMContext &Ctx = Builder.getContext();
3640 
3641   Function *F = CanonicalLoop->getFunction();
3642 
3643   // TODO: We should not rely on the pass manager. Currently we use it only
3644   // to get the llvm::Loop that corresponds to the given CanonicalLoopInfo
3645   // object. We should have a method that returns all blocks between
3646   // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter().
3647   FunctionAnalysisManager FAM;
3648   FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3649   FAM.registerPass([]() { return LoopAnalysis(); });
3650   FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3651 
3652   LoopAnalysis LIA;
3653   LoopInfo &&LI = LIA.run(*F, FAM);
3654 
3655   Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
3656   if (AlignedVars.size()) {
3657     InsertPointTy IP = Builder.saveIP();
3658     Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
3659     for (auto &AlignedItem : AlignedVars) {
3660       Value *AlignedPtr = AlignedItem.first;
3661       Value *Alignment = AlignedItem.second;
3662       Builder.CreateAlignmentAssumption(F->getParent()->getDataLayout(),
3663                                         AlignedPtr, Alignment);
3664     }
3665     Builder.restoreIP(IP);
3666   }
3667 
3668   if (IfCond) {
3669     ValueToValueMapTy VMap;
3670     createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
3671     // Add metadata to the cloned loop which disables vectorization
3672     Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
3673     assert(MappedLatch &&
3674            "Cannot find value which corresponds to original loop latch");
3675     assert(isa<BasicBlock>(MappedLatch) &&
3676            "Cannot cast mapped latch block value to BasicBlock");
3677     BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
3678     ConstantAsMetadata *BoolConst =
3679         ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
3680     addBasicBlockMetadata(
3681         NewLatchBlock,
3682         {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
3683                            BoolConst})});
3684   }
3685 
3686   SmallSet<BasicBlock *, 8> Reachable;
3687 
3688   // Get the basic blocks from the loop in which memref instructions
3689   // can be found.
3690   // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
3691   // preferably without running any passes.
3692   for (BasicBlock *Block : L->getBlocks()) {
3693     if (Block == CanonicalLoop->getCond() ||
3694         Block == CanonicalLoop->getHeader())
3695       continue;
3696     Reachable.insert(Block);
3697   }
3698 
3699   SmallVector<Metadata *> LoopMDList;
3700 
3701   // In the presence of a finite 'safelen', it may be unsafe to mark all the
3702   // memory instructions as parallel, because loop-carried dependences at
3703   // distances of up to 'safelen' iterations are possible.
3704   // If the order(concurrent) clause is specified, the memory instructions
3705   // are marked parallel even if 'safelen' is finite.
3706   if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
3707     // Add access group metadata to memory-access instructions.
3708     MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
3709     for (BasicBlock *BB : Reachable)
3710       addSimdMetadata(BB, AccessGroup, LI);
3711     // TODO: If the loop has existing parallel access metadata, we have
3712     // to combine the two lists.
3713     LoopMDList.push_back(MDNode::get(
3714         Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
3715   }
3716 
3717   // Use the above access group metadata to create loop level
3718   // metadata, which should be distinct for each loop.
3719   ConstantAsMetadata *BoolConst =
3720       ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
3721   LoopMDList.push_back(MDNode::get(
3722       Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
3723 
3724   if (Simdlen || Safelen) {
3725     // If both simdlen and safelen clauses are specified, the value of the
3726     // simdlen parameter must be less than or equal to the value of the safelen
3727     // parameter. Therefore, use safelen only in the absence of simdlen.
3728     ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
3729     LoopMDList.push_back(
3730         MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
3731                           ConstantAsMetadata::get(VectorizeWidth)}));
3732   }
3733 
3734   addLoopMetadata(CanonicalLoop, LoopMDList);
3735 }
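// For example, for 'order(concurrent) simdlen(8)' the loop latch ends up with
// metadata along the lines of (sketch):
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}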
3736 
3737 /// Create the TargetMachine object to query the backend for optimization
3738 /// preferences.
3739 ///
3740 /// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
3741 /// e.g. Clang does not pass it to its CodeGen layer and creates it only when
3742 /// needed for the LLVM pass pipeline. We use some default options to avoid
3743 /// having to pass too many settings from the frontend that probably do not
3744 /// matter.
3745 ///
3746 /// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
3747 /// method. If we are going to use TargetMachine for more purposes, especially
3748 /// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
3749 /// might become be worth requiring front-ends to pass on their TargetMachine,
3750 /// or at least cache it between methods. Note that while fontends such as Clang
3751 /// have just a single main TargetMachine per translation unit, "target-cpu" and
3752 /// "target-features" that determine the TargetMachine are per-function and can
3753 /// be overrided using __attribute__((target("OPTIONS"))).
3754 static std::unique_ptr<TargetMachine>
3755 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
3756   Module *M = F->getParent();
3757 
3758   StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
3759   StringRef Features = F->getFnAttribute("target-features").getValueAsString();
3760   const std::string &Triple = M->getTargetTriple();
3761 
3762   std::string Error;
3763   const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
3764   if (!TheTarget)
3765     return {};
3766 
3767   llvm::TargetOptions Options;
3768   return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
3769       Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
3770       /*CodeModel=*/std::nullopt, OptLevel));
3771 }
3772 
3773 /// Heuristically determine the best-performing unroll factor for \p CLI. This
3774 /// depends on the target processor. We reuse the same heuristics as the
3775 /// LoopUnrollPass.
3776 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
3777   Function *F = CLI->getFunction();
3778 
3779   // Assume the user requests the most aggressive unrolling, even if the rest of
3780   // the code is optimized using a lower setting.
3781   CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
3782   std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
3783 
3784   FunctionAnalysisManager FAM;
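  // We are not running inside a pass pipeline here, so set up a throwaway
  // FunctionAnalysisManager and register just the analyses the unrolling
  // heuristics below require.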
3785   FAM.registerPass([]() { return TargetLibraryAnalysis(); });
3786   FAM.registerPass([]() { return AssumptionAnalysis(); });
3787   FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3788   FAM.registerPass([]() { return LoopAnalysis(); });
3789   FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
3790   FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3791   TargetIRAnalysis TIRA;
3792   if (TM)
3793     TIRA = TargetIRAnalysis(
3794         [&](const Function &F) { return TM->getTargetTransformInfo(F); });
3795   FAM.registerPass([&]() { return TIRA; });
3796 
3797   TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
3798   ScalarEvolutionAnalysis SEA;
3799   ScalarEvolution &&SE = SEA.run(*F, FAM);
3800   DominatorTreeAnalysis DTA;
3801   DominatorTree &&DT = DTA.run(*F, FAM);
3802   LoopAnalysis LIA;
3803   LoopInfo &&LI = LIA.run(*F, FAM);
3804   AssumptionAnalysis ACT;
3805   AssumptionCache &&AC = ACT.run(*F, FAM);
3806   OptimizationRemarkEmitter ORE{F};
3807 
3808   Loop *L = LI.getLoopFor(CLI->getHeader());
3809   assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
3810 
3811   TargetTransformInfo::UnrollingPreferences UP =
3812       gatherUnrollingPreferences(L, SE, TTI,
3813                                  /*BlockFrequencyInfo=*/nullptr,
3814                                  /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
3815                                  /*UserThreshold=*/std::nullopt,
3816                                  /*UserCount=*/std::nullopt,
3817                                  /*UserAllowPartial=*/true,
3818                                  /*UserAllowRuntime=*/true,
3819                                  /*UserUpperBound=*/std::nullopt,
3820                                  /*UserFullUnrollMaxCount=*/std::nullopt);
3821 
3822   UP.Force = true;
3823 
3824   // Account for additional optimizations taking place before the LoopUnrollPass
3825   // would unroll the loop.
3826   UP.Threshold *= UnrollThresholdFactor;
3827   UP.PartialThreshold *= UnrollThresholdFactor;
3828 
3829   // Use normal unroll factors even if the rest of the code is optimized for
3830   // size.
3831   UP.OptSizeThreshold = UP.Threshold;
3832   UP.PartialOptSizeThreshold = UP.PartialThreshold;
3833 
3834   LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
3835                     << "  Threshold=" << UP.Threshold << "\n"
3836                     << "  PartialThreshold=" << UP.PartialThreshold << "\n"
3837                     << "  OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
3838                     << "  PartialOptSizeThreshold="
3839                     << UP.PartialOptSizeThreshold << "\n");
3840 
3841   // Disable peeling.
3842   TargetTransformInfo::PeelingPreferences PP =
3843       gatherPeelingPreferences(L, SE, TTI,
3844                                /*UserAllowPeeling=*/false,
3845                                /*UserAllowProfileBasedPeeling=*/false,
3846                                /*UnrollingSpecficValues=*/false);
3847 
3848   SmallPtrSet<const Value *, 32> EphValues;
3849   CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
3850 
3851   // Assume that reads and writes to stack variables can be eliminated by
3852   // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
3853   // size.
3854   for (BasicBlock *BB : L->blocks()) {
3855     for (Instruction &I : *BB) {
3856       Value *Ptr;
3857       if (auto *Load = dyn_cast<LoadInst>(&I)) {
3858         Ptr = Load->getPointerOperand();
3859       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3860         Ptr = Store->getPointerOperand();
3861       } else
3862         continue;
3863 
3864       Ptr = Ptr->stripPointerCasts();
3865 
3866       if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
3867         if (Alloca->getParent() == &F->getEntryBlock())
3868           EphValues.insert(&I);
3869       }
3870     }
3871   }
3872 
3873   UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
3874 
3875   // The loop is not unrollable if it contains certain instructions.
3876   if (!UCE.canUnroll() || UCE.Convergent) {
3877     LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
3878     return 1;
3879   }
3880 
3881   LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
3882                     << "\n");
3883 
3884   // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
3885   // be able to use it.
3886   int TripCount = 0;
3887   int MaxTripCount = 0;
3888   bool MaxOrZero = false;
3889   unsigned TripMultiple = 0;
3890 
3891   bool UseUpperBound = false;
3892   computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
3893                      MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
3894                      UseUpperBound);
3895   unsigned Factor = UP.Count;
3896   LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
3897 
3898   // This function returns 1 to signal that the loop should not be unrolled.
3899   if (Factor == 0)
3900     return 1;
3901   return Factor;
3902 }
3903 
3904 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
3905                                         int32_t Factor,
3906                                         CanonicalLoopInfo **UnrolledCLI) {
3907   assert(Factor >= 0 && "Unroll factor must not be negative");
3908 
3909   Function *F = Loop->getFunction();
3910   LLVMContext &Ctx = F->getContext();
3911 
3912   // If the unrolled loop is not used for another loop-associated directive, it
3913   // is sufficient to add metadata for the LoopUnrollPass.
3914   if (!UnrolledCLI) {
3915     SmallVector<Metadata *, 2> LoopMetadata;
3916     LoopMetadata.push_back(
3917         MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
3918 
3919     if (Factor >= 1) {
3920       ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
3921           ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
3922       LoopMetadata.push_back(MDNode::get(
3923           Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
3924     }
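    // E.g. Factor == 4 requests metadata roughly of the form (illustrative):
    //   !llvm.loop !{..., !{!"llvm.loop.unroll.enable"},
    //                !{!"llvm.loop.unroll.count", i32 4}}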
3925 
3926     addLoopMetadata(Loop, LoopMetadata);
3927     return;
3928   }
3929 
3930   // Heuristically determine the unroll factor.
3931   if (Factor == 0)
3932     Factor = computeHeuristicUnrollFactor(Loop);
3933 
3934   // No change required with unroll factor 1.
3935   if (Factor == 1) {
3936     *UnrolledCLI = Loop;
3937     return;
3938   }
3939 
3940   assert(Factor >= 2 &&
3941          "unrolling only makes sense with a factor of 2 or larger");
3942 
3943   Type *IndVarTy = Loop->getIndVarType();
3944 
3945   // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
3946   // unroll the inner loop.
3947   Value *FactorVal =
3948       ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
3949                                        /*isSigned=*/false));
3950   std::vector<CanonicalLoopInfo *> LoopNest =
3951       tileLoops(DL, {Loop}, {FactorVal});
3952   assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
3953   *UnrolledCLI = LoopNest[0];
3954   CanonicalLoopInfo *InnerLoop = LoopNest[1];
3955 
3956   // LoopUnrollPass can only fully unroll loops with constant trip count.
3957   // Unroll by the unroll factor with a fallback epilog for the remainder
3958   // iterations if necessary.
3959   ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
3960       ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
3961   addLoopMetadata(
3962       InnerLoop,
3963       {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3964        MDNode::get(
3965            Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
3966 
3967 #ifndef NDEBUG
3968   (*UnrolledCLI)->assertOK();
3969 #endif
3970 }
3971 
3972 OpenMPIRBuilder::InsertPointTy
3973 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
3974                                    llvm::Value *BufSize, llvm::Value *CpyBuf,
3975                                    llvm::Value *CpyFn, llvm::Value *DidIt) {
3976   if (!updateToLocation(Loc))
3977     return Loc.IP;
3978 
3979   uint32_t SrcLocStrSize;
3980   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3981   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3982   Value *ThreadId = getOrCreateThreadID(Ident);
3983 
3984   llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
3985 
3986   Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
3987 
3988   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
3989   Builder.CreateCall(Fn, Args);
3990 
3991   return Builder.saveIP();
3992 }
3993 
3994 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
3995     const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
3996     FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {
3997 
3998   if (!updateToLocation(Loc))
3999     return Loc.IP;
4000 
4001   // If needed (i.e. not null), initialize `DidIt` with 0
4002   if (DidIt) {
4003     Builder.CreateStore(Builder.getInt32(0), DidIt);
4004   }
4005 
4006   Directive OMPD = Directive::OMPD_single;
4007   uint32_t SrcLocStrSize;
4008   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4009   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4010   Value *ThreadId = getOrCreateThreadID(Ident);
4011   Value *Args[] = {Ident, ThreadId};
4012 
4013   Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
4014   Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4015 
4016   Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
4017   Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4018 
4019   // Generates the following:
4020   // if (__kmpc_single()) {
4021   //   ... single region ...
4022   //   __kmpc_end_single
4023   // }
4024   // __kmpc_barrier
4025 
4026   EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4027                        /*Conditional*/ true,
4028                        /*hasFinalize*/ true);
4029   if (!IsNowait)
4030     createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
4031                   omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
4032                   /* CheckCancelFlag */ false);
4033   return Builder.saveIP();
4034 }
4035 
4036 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
4037     const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4038     FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
4039 
4040   if (!updateToLocation(Loc))
4041     return Loc.IP;
4042 
4043   Directive OMPD = Directive::OMPD_critical;
4044   uint32_t SrcLocStrSize;
4045   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4046   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4047   Value *ThreadId = getOrCreateThreadID(Ident);
4048   Value *LockVar = getOMPCriticalRegionLock(CriticalName);
4049   Value *Args[] = {Ident, ThreadId, LockVar};
4050 
4051   SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
4052   Function *RTFn = nullptr;
4053   if (HintInst) {
4054     // Add Hint to entry Args and create call
4055     EnterArgs.push_back(HintInst);
4056     RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
4057   } else {
4058     RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
4059   }
4060   Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
4061 
4062   Function *ExitRTLFn =
4063       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
4064   Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4065 
4066   return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4067                               /*Conditional*/ false, /*hasFinalize*/ true);
4068 }
4069 
4070 OpenMPIRBuilder::InsertPointTy
4071 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
4072                                      InsertPointTy AllocaIP, unsigned NumLoops,
4073                                      ArrayRef<llvm::Value *> StoreValues,
4074                                      const Twine &Name, bool IsDependSource) {
4075   assert(
4076       llvm::all_of(StoreValues,
4077                    [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
4078       "OpenMP runtime requires depend vec with i64 type");
4079 
4080   if (!updateToLocation(Loc))
4081     return Loc.IP;
4082 
4083   // Allocate space for vector and generate alloc instruction.
4084   auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
4085   Builder.restoreIP(AllocaIP);
4086   AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
4087   ArgsBase->setAlignment(Align(8));
4088   Builder.restoreIP(Loc.IP);
4089 
4090   // Store the index value with offset in depend vector.
4091   for (unsigned I = 0; I < NumLoops; ++I) {
4092     Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
4093         ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
4094     StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
4095     STInst->setAlignment(Align(8));
4096   }
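  // For NumLoops == 2 with IsDependSource, this emits pseudo-IR roughly like
  // (illustrative):
  //   %vec = alloca [2 x i64], align 8
  //   store i64 %iv0 into %vec[0] ; via in-bounds GEPs
  //   store i64 %iv1 into %vec[1]
  //   call @__kmpc_doacross_post(%ident, %tid, &%vec[0])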
4097 
4098   Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
4099       ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
4100 
4101   uint32_t SrcLocStrSize;
4102   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4103   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4104   Value *ThreadId = getOrCreateThreadID(Ident);
4105   Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
4106 
4107   Function *RTLFn = nullptr;
4108   if (IsDependSource)
4109     RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
4110   else
4111     RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
4112   Builder.CreateCall(RTLFn, Args);
4113 
4114   return Builder.saveIP();
4115 }
4116 
4117 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
4118     const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4119     FinalizeCallbackTy FiniCB, bool IsThreads) {
4120   if (!updateToLocation(Loc))
4121     return Loc.IP;
4122 
4123   Directive OMPD = Directive::OMPD_ordered;
4124   Instruction *EntryCall = nullptr;
4125   Instruction *ExitCall = nullptr;
4126 
4127   if (IsThreads) {
4128     uint32_t SrcLocStrSize;
4129     Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4130     Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4131     Value *ThreadId = getOrCreateThreadID(Ident);
4132     Value *Args[] = {Ident, ThreadId};
4133 
4134     Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
4135     EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4136 
4137     Function *ExitRTLFn =
4138         getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
4139     ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4140   }
4141 
4142   return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4143                               /*Conditional*/ false, /*hasFinalize*/ true);
4144 }
4145 
4146 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
4147     Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
4148     BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
4149     bool HasFinalize, bool IsCancellable) {
4150 
4151   if (HasFinalize)
4152     FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
4153 
4154   // Create inlined region's entry and body blocks, in preparation
4155   // for conditional creation
4156   BasicBlock *EntryBB = Builder.GetInsertBlock();
4157   Instruction *SplitPos = EntryBB->getTerminator();
4158   if (!isa_and_nonnull<BranchInst>(SplitPos))
4159     SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
4160   BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
4161   BasicBlock *FiniBB =
4162       EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
4163 
4164   Builder.SetInsertPoint(EntryBB->getTerminator());
4165   emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
4166 
4167   // Generate the body.
4168   BodyGenCB(/* AllocaIP */ InsertPointTy(),
4169             /* CodeGenIP */ Builder.saveIP());
4170 
4171   // Emit the exit call and do any needed finalization.
4172   auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
4173   assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
4174          FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
4175          "Unexpected control flow graph state!!");
4176   emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
4177   assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
4178          "Unexpected Control Flow State!");
4179   MergeBlockIntoPredecessor(FiniBB);
4180 
4181   // If we are skipping the region of a non-conditional, remove the exit
4182   // block and update the builder's insertion point.
4183   assert(SplitPos->getParent() == ExitBB &&
4184          "Unexpected Insertion point location!");
4185   auto merged = MergeBlockIntoPredecessor(ExitBB);
4186   BasicBlock *ExitPredBB = SplitPos->getParent();
4187   auto InsertBB = merged ? ExitPredBB : ExitBB;
4188   if (!isa_and_nonnull<BranchInst>(SplitPos))
4189     SplitPos->eraseFromParent();
4190   Builder.SetInsertPoint(InsertBB);
4191 
4192   return Builder.saveIP();
4193 }
4194 
4195 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
4196     Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
4197   // If there is nothing to do, return the current insertion point.
4198   if (!Conditional || !EntryCall)
4199     return Builder.saveIP();
4200 
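  // Conceptually (illustrative): the entry block's terminator is moved into a
  // new 'omp_region.body' block and replaced by
  //   %c = icmp ne <EntryCall>, 0
  //   br i1 %c, label %omp_region.body, label %<ExitBB>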
4201   BasicBlock *EntryBB = Builder.GetInsertBlock();
4202   Value *CallBool = Builder.CreateIsNotNull(EntryCall);
4203   auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
4204   auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
4205 
4206   // Emit thenBB and set the Builder's insertion point there for
4207   // body generation next. Place the block after the current block.
4208   Function *CurFn = EntryBB->getParent();
4209   CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
4210 
4211   // Move Entry branch to end of ThenBB, and replace with conditional
4212   // branch (If-stmt)
4213   Instruction *EntryBBTI = EntryBB->getTerminator();
4214   Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
4215   EntryBBTI->removeFromParent();
4216   Builder.SetInsertPoint(UI);
4217   Builder.Insert(EntryBBTI);
4218   UI->eraseFromParent();
4219   Builder.SetInsertPoint(ThenBB->getTerminator());
4220 
4221   // return an insertion point to ExitBB.
4222   return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
4223 }
4224 
4225 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
4226     omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
4227     bool HasFinalize) {
4228 
4229   Builder.restoreIP(FinIP);
4230 
4231   // If there is finalization to do, emit it before the exit call
4232   if (HasFinalize) {
4233     assert(!FinalizationStack.empty() &&
4234            "Unexpected finalization stack state!");
4235 
4236     FinalizationInfo Fi = FinalizationStack.pop_back_val();
4237     assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
4238 
4239     Fi.FiniCB(FinIP);
4240 
4241     BasicBlock *FiniBB = FinIP.getBlock();
4242     Instruction *FiniBBTI = FiniBB->getTerminator();
4243 
4244     // Set the Builder's insert point for call creation.
4245     Builder.SetInsertPoint(FiniBBTI);
4246   }
4247 
4248   if (!ExitCall)
4249     return Builder.saveIP();
4250 
4251   // Place the exit call as the last instruction before the finalization block terminator.
4252   ExitCall->removeFromParent();
4253   Builder.Insert(ExitCall);
4254 
4255   return IRBuilder<>::InsertPoint(ExitCall->getParent(),
4256                                   ExitCall->getIterator());
4257 }
4258 
4259 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
4260     InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
4261     llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
4262   if (!IP.isSet())
4263     return IP;
4264 
4265   IRBuilder<>::InsertPointGuard IPG(Builder);
4266 
4267   // Creates the following CFG structure:
4268   //   OMP_Entry : (MasterAddr != PrivateAddr)?
4269   //       F     T
4270   //       |      \
4271   //       |     copyin.not.master
4272   //       |      /
4273   //       v     /
4274   //   copyin.not.master.end
4275   //         |
4276   //         v
4277   //   OMP.Entry.Next
4278 
4279   BasicBlock *OMP_Entry = IP.getBlock();
4280   Function *CurFn = OMP_Entry->getParent();
4281   BasicBlock *CopyBegin =
4282       BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
4283   BasicBlock *CopyEnd = nullptr;
4284 
4285   // If the entry block is terminated, split it to preserve the branch to the
4286   // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
4287   if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
4288     CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
4289                                          "copyin.not.master.end");
4290     OMP_Entry->getTerminator()->eraseFromParent();
4291   } else {
4292     CopyEnd =
4293         BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
4294   }
4295 
4296   Builder.SetInsertPoint(OMP_Entry);
4297   Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
4298   Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
4299   Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
4300   Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
4301 
4302   Builder.SetInsertPoint(CopyBegin);
4303   if (BranchtoEnd)
4304     Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
4305 
4306   return Builder.saveIP();
4307 }
4308 
4309 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
4310                                           Value *Size, Value *Allocator,
4311                                           std::string Name) {
4312   IRBuilder<>::InsertPointGuard IPG(Builder);
4313   Builder.restoreIP(Loc.IP);
4314 
4315   uint32_t SrcLocStrSize;
4316   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4317   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4318   Value *ThreadId = getOrCreateThreadID(Ident);
4319   Value *Args[] = {ThreadId, Size, Allocator};
4320 
4321   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
4322 
4323   return Builder.CreateCall(Fn, Args, Name);
4324 }
4325 
4326 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
4327                                          Value *Addr, Value *Allocator,
4328                                          std::string Name) {
4329   IRBuilder<>::InsertPointGuard IPG(Builder);
4330   Builder.restoreIP(Loc.IP);
4331 
4332   uint32_t SrcLocStrSize;
4333   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4334   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4335   Value *ThreadId = getOrCreateThreadID(Ident);
4336   Value *Args[] = {ThreadId, Addr, Allocator};
4337   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
4338   return Builder.CreateCall(Fn, Args, Name);
4339 }
4340 
4341 CallInst *OpenMPIRBuilder::createOMPInteropInit(
4342     const LocationDescription &Loc, Value *InteropVar,
4343     omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
4344     Value *DependenceAddress, bool HaveNowaitClause) {
4345   IRBuilder<>::InsertPointGuard IPG(Builder);
4346   Builder.restoreIP(Loc.IP);
4347 
4348   uint32_t SrcLocStrSize;
4349   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4350   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4351   Value *ThreadId = getOrCreateThreadID(Ident);
4352   if (Device == nullptr)
4353     Device = ConstantInt::get(Int32, -1);
4354   Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
4355   if (NumDependences == nullptr) {
4356     NumDependences = ConstantInt::get(Int32, 0);
4357     PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4358     DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4359   }
4360   Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4361   Value *Args[] = {
4362       Ident,  ThreadId,       InteropVar,        InteropTypeVal,
4363       Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
4364 
4365   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
4366 
4367   return Builder.CreateCall(Fn, Args);
4368 }
4369 
4370 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
4371     const LocationDescription &Loc, Value *InteropVar, Value *Device,
4372     Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
4373   IRBuilder<>::InsertPointGuard IPG(Builder);
4374   Builder.restoreIP(Loc.IP);
4375 
4376   uint32_t SrcLocStrSize;
4377   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4378   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4379   Value *ThreadId = getOrCreateThreadID(Ident);
4380   if (Device == nullptr)
4381     Device = ConstantInt::get(Int32, -1);
4382   if (NumDependences == nullptr) {
4383     NumDependences = ConstantInt::get(Int32, 0);
4384     PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4385     DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4386   }
4387   Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4388   Value *Args[] = {
4389       Ident,          ThreadId,          InteropVar,         Device,
4390       NumDependences, DependenceAddress, HaveNowaitClauseVal};
4391 
4392   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
4393 
4394   return Builder.CreateCall(Fn, Args);
4395 }
4396 
4397 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
4398                                                Value *InteropVar, Value *Device,
4399                                                Value *NumDependences,
4400                                                Value *DependenceAddress,
4401                                                bool HaveNowaitClause) {
4402   IRBuilder<>::InsertPointGuard IPG(Builder);
4403   Builder.restoreIP(Loc.IP);
4404   uint32_t SrcLocStrSize;
4405   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4406   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4407   Value *ThreadId = getOrCreateThreadID(Ident);
4408   if (Device == nullptr)
4409     Device = ConstantInt::get(Int32, -1);
4410   if (NumDependences == nullptr) {
4411     NumDependences = ConstantInt::get(Int32, 0);
4412     PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4413     DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4414   }
4415   Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4416   Value *Args[] = {
4417       Ident,          ThreadId,          InteropVar,         Device,
4418       NumDependences, DependenceAddress, HaveNowaitClauseVal};
4419 
4420   Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
4421 
4422   return Builder.CreateCall(Fn, Args);
4423 }
4424 
4425 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
4426     const LocationDescription &Loc, llvm::Value *Pointer,
4427     llvm::ConstantInt *Size, const llvm::Twine &Name) {
4428   IRBuilder<>::InsertPointGuard IPG(Builder);
4429   Builder.restoreIP(Loc.IP);
4430 
4431   uint32_t SrcLocStrSize;
4432   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4433   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4434   Value *ThreadId = getOrCreateThreadID(Ident);
4435   Constant *ThreadPrivateCache =
4436       getOrCreateInternalVariable(Int8PtrPtr, Name.str());
4437   llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
4438 
4439   Function *Fn =
4440       getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
4441 
4442   return Builder.CreateCall(Fn, Args);
4443 }
4444 
4445 OpenMPIRBuilder::InsertPointTy
4446 OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
4447                                   int32_t MinThreadsVal, int32_t MaxThreadsVal,
4448                                   int32_t MinTeamsVal, int32_t MaxTeamsVal) {
4449   if (!updateToLocation(Loc))
4450     return Loc.IP;
4451 
4452   uint32_t SrcLocStrSize;
4453   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4454   Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4455   Constant *IsSPMDVal = ConstantInt::getSigned(
4456       Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
4457   Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
4458   Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
4459   Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
4460 
4461   Function *Kernel = Builder.GetInsertBlock()->getParent();
4462 
4463   // Manifest the launch configuration in the metadata matching the kernel
4464   // environment.
4465   if (MinTeamsVal > 1 || MaxTeamsVal > 0)
4466     writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
4467 
4468   // For max values, < 0 means unset, == 0 means set but unknown.
4469   if (MaxThreadsVal < 0)
4470     MaxThreadsVal = std::max(
4471         int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
4472 
4473   if (MaxThreadsVal > 0)
4474     writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
4475 
4476   Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
4477   Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
4478   Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
4479   Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
4480   Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
4481   Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
4482 
4483   // We need to strip the debug prefix to get the correct kernel name.
4484   StringRef KernelName = Kernel->getName();
4485   const std::string DebugPrefix = "_debug__";
4486   if (KernelName.ends_with(DebugPrefix))
4487     KernelName = KernelName.drop_back(DebugPrefix.length());
4488 
4489   Function *Fn = getOrCreateRuntimeFunctionPtr(
4490       omp::RuntimeFunction::OMPRTL___kmpc_target_init);
4491   const DataLayout &DL = Fn->getParent()->getDataLayout();
4492 
4493   Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
4494   Constant *DynamicEnvironmentInitializer =
4495       ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
4496   GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
4497       M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
4498       DynamicEnvironmentInitializer, DynamicEnvironmentName,
4499       /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
4500       DL.getDefaultGlobalsAddressSpace());
4501   DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
4502 
4503   Constant *DynamicEnvironment =
4504       DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
4505           ? DynamicEnvironmentGV
4506           : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
4507                                            DynamicEnvironmentPtr);
4508 
4509   Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
4510       ConfigurationEnvironment, {
4511                                     UseGenericStateMachineVal,
4512                                     MayUseNestedParallelismVal,
4513                                     IsSPMDVal,
4514                                     MinThreads,
4515                                     MaxThreads,
4516                                     MinTeams,
4517                                     MaxTeams,
4518                                     ReductionDataSize,
4519                                     ReductionBufferLength,
4520                                 });
4521   Constant *KernelEnvironmentInitializer = ConstantStruct::get(
4522       KernelEnvironment, {
4523                              ConfigurationEnvironmentInitializer,
4524                              Ident,
4525                              DynamicEnvironment,
4526                          });
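  // The resulting global is roughly (illustrative; field types elided):
  //   @<kernel>_kernel_environment = weak_odr protected constant
  //     { <configuration environment>, ptr <ident>,
  //       ptr @<kernel>_dynamic_environment }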
4527   Twine KernelEnvironmentName = KernelName + "_kernel_environment";
4528   GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
4529       M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
4530       KernelEnvironmentInitializer, KernelEnvironmentName,
4531       /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
4532       DL.getDefaultGlobalsAddressSpace());
4533   KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
4534 
4535   Constant *KernelEnvironment =
4536       KernelEnvironmentGV->getType() == KernelEnvironmentPtr
4537           ? KernelEnvironmentGV
4538           : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
4539                                            KernelEnvironmentPtr);
4540   Value *KernelLaunchEnvironment = Kernel->getArg(0);
4541   CallInst *ThreadKind =
4542       Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
4543 
4544   Value *ExecUserCode = Builder.CreateICmpEQ(
4545       ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
4546       "exec_user_code");
4547 
4548   // ThreadKind = __kmpc_target_init(...)
4549   // if (ThreadKind == -1)
4550   //   user_code
4551   // else
4552   //   return;
4553 
4554   auto *UI = Builder.CreateUnreachable();
4555   BasicBlock *CheckBB = UI->getParent();
4556   BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
4557 
4558   BasicBlock *WorkerExitBB = BasicBlock::Create(
4559       CheckBB->getContext(), "worker.exit", CheckBB->getParent());
4560   Builder.SetInsertPoint(WorkerExitBB);
4561   Builder.CreateRetVoid();
4562 
4563   auto *CheckBBTI = CheckBB->getTerminator();
4564   Builder.SetInsertPoint(CheckBBTI);
4565   Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
4566 
4567   CheckBBTI->eraseFromParent();
4568   UI->eraseFromParent();
4569 
4570   // Continue in the "user_code" block, see diagram above and in
4571   // openmp/libomptarget/deviceRTLs/common/include/target.h .
4572   return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
4573 }
4574 
4575 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
4576                                          int32_t TeamsReductionDataSize,
4577                                          int32_t TeamsReductionBufferLength) {
4578   if (!updateToLocation(Loc))
4579     return;
4580 
4581   Function *Fn = getOrCreateRuntimeFunctionPtr(
4582       omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
4583 
4584   Builder.CreateCall(Fn, {});
4585 
4586   if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
4587     return;
4588 
4589   Function *Kernel = Builder.GetInsertBlock()->getParent();
4590   // We need to strip the debug prefix to get the correct kernel name.
4591   StringRef KernelName = Kernel->getName();
4592   const std::string DebugPrefix = "_debug__";
4593   if (KernelName.ends_with(DebugPrefix))
4594     KernelName = KernelName.drop_back(DebugPrefix.length());
4595   auto *KernelEnvironmentGV =
4596       M.getNamedGlobal((KernelName + "_kernel_environment").str());
4597   assert(KernelEnvironmentGV && "Expected kernel environment global");
4598   auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
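  // Patch the reduction configuration into the existing initializer; indices
  // {0, 7} and {0, 8} address the ReductionDataSize and ReductionBufferLength
  // fields of the nested ConfigurationEnvironment.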
4599   auto *NewInitializer = ConstantFoldInsertValueInstruction(
4600       KernelEnvironmentInitializer,
4601       ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
4602   NewInitializer = ConstantFoldInsertValueInstruction(
4603       NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
4604       {0, 8});
4605   KernelEnvironmentGV->setInitializer(NewInitializer);
4606 }
4607 
4608 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
4609   Module &M = *Kernel.getParent();
4610   NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4611   for (auto *Op : MD->operands()) {
4612     if (Op->getNumOperands() != 3)
4613       continue;
4614     auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
4615     if (!KernelOp || KernelOp->getValue() != &Kernel)
4616       continue;
4617     auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
4618     if (!Prop || Prop->getString() != Name)
4619       continue;
4620     return Op;
4621   }
4622   return nullptr;
4623 }
4624 
4625 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
4626                                 bool Min) {
4627   // Update the "maxntidx" metadata for NVIDIA, or add it.
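  // An nvvm.annotations operand has the shape (illustrative):
  //   !{ptr @kernel, !"maxntidx", i32 128}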
4628   MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
4629   if (ExistingOp) {
4630     auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
4631     int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
4632     ExistingOp->replaceOperandWith(
4633         2, ConstantAsMetadata::get(ConstantInt::get(
4634                OldVal->getValue()->getType(),
4635                Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
4636   } else {
4637     LLVMContext &Ctx = Kernel.getContext();
4638     Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
4639                           MDString::get(Ctx, Name),
4640                           ConstantAsMetadata::get(
4641                               ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
4642     // Append metadata to nvvm.annotations
4643     Module &M = *Kernel.getParent();
4644     NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4645     MD->addOperand(MDNode::get(Ctx, MDVals));
4646   }
4647 }
4648 
4649 std::pair<int32_t, int32_t>
4650 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
4651   int32_t ThreadLimit =
4652       Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
4653 
4654   if (T.isAMDGPU()) {
4655     const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
4656     if (!Attr.isValid() || !Attr.isStringAttribute())
4657       return {0, ThreadLimit};
4658     auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
4659     int32_t LB, UB;
4660     if (!llvm::to_integer(UBStr, UB, 10))
4661       return {0, ThreadLimit};
4662     UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
4663     if (!llvm::to_integer(LBStr, LB, 10))
4664       return {0, UB};
4665     return {LB, UB};
4666   }
4667 
4668   if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
4669     auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
4670     int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
4671     return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
4672   }
4673   return {0, ThreadLimit};
4674 }
4675 
4676 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
4677                                                  Function &Kernel, int32_t LB,
4678                                                  int32_t UB) {
4679   Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
4680 
4681   if (T.isAMDGPU()) {
4682     Kernel.addFnAttr("amdgpu-flat-work-group-size",
4683                      llvm::utostr(LB) + "," + llvm::utostr(UB));
4684     return;
4685   }
4686 
4687   updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
4688 }
4689 
4690 std::pair<int32_t, int32_t>
4691 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
4692   // TODO: Read from backend annotations if available.
4693   return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
4694 }
4695 
4696 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
4697                                           int32_t LB, int32_t UB) {
4698   if (T.isNVPTX()) {
4699     if (UB > 0)
4700       updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
4701     updateNVPTXMetadata(Kernel, "minctasm", LB, false);
4702   }
4703   Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
4704 }
4705 
4706 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
4707     Function *OutlinedFn) {
4708   if (Config.isTargetDevice()) {
4709     OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
4710     // TODO: Determine if DSO local can be set to true.
4711     OutlinedFn->setDSOLocal(false);
4712     OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
4713     if (T.isAMDGCN())
4714       OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
4715   }
4716 }
4717 
4718 Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
4719                                                     StringRef EntryFnIDName) {
4720   if (Config.isTargetDevice()) {
4721     assert(OutlinedFn && "The outlined function must exist if embedded");
4722     return OutlinedFn;
4723   }
4724 
4725   return new GlobalVariable(
4726       M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
4727       Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
4728 }
4729 
4730 Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
4731                                                        StringRef EntryFnName) {
4732   if (OutlinedFn)
4733     return OutlinedFn;
4734 
4735   assert(!M.getGlobalVariable(EntryFnName, true) &&
4736          "Named kernel already exists?");
4737   return new GlobalVariable(
4738       M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
4739       Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
4740 }
4741 
4742 void OpenMPIRBuilder::emitTargetRegionFunction(
4743     TargetRegionEntryInfo &EntryInfo,
4744     FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
4745     Function *&OutlinedFn, Constant *&OutlinedFnID) {
4746 
4747   SmallString<64> EntryFnName;
4748   OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
4749 
4750   OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
4751                    ? GenerateFunctionCallback(EntryFnName)
4752                    : nullptr;
4753 
4754   // If this target outline function is not an offload entry, we don't need to
4755   // register it. This may be the case for a false if clause, or if there are
4756   // no OpenMP targets.
4757   if (!IsOffloadEntry)
4758     return;
4759 
4760   std::string EntryFnIDName =
4761       Config.isTargetDevice()
4762           ? std::string(EntryFnName)
4763           : createPlatformSpecificName({EntryFnName, "region_id"});
4764 
4765   OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
4766                                               EntryFnName, EntryFnIDName);
4767 }
4768 
4769 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
4770     TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
4771     StringRef EntryFnName, StringRef EntryFnIDName) {
4772   if (OutlinedFn)
4773     setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
4774   auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
4775   auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
4776   OffloadInfoManager.registerTargetRegionEntryInfo(
4777       EntryInfo, EntryAddr, OutlinedFnID,
4778       OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
4779   return OutlinedFnID;
4780 }
4781 
4782 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
4783     const LocationDescription &Loc, InsertPointTy AllocaIP,
4784     InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
4785     TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
4786     omp::RuntimeFunction *MapperFunc,
4787     function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
4788         BodyGenCB,
4789     function_ref<void(unsigned int, Value *)> DeviceAddrCB,
4790     function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
4791   if (!updateToLocation(Loc))
4792     return InsertPointTy();
4793 
4794   // Disable TargetData CodeGen on Device pass.
4795   if (Config.IsTargetDevice.value_or(false))
4796     return Builder.saveIP();
4797 
4798   Builder.restoreIP(CodeGenIP);
4799   bool IsStandAlone = !BodyGenCB;
4800   MapInfosTy *MapInfo;
4801   // Generate the code for the opening of the data environment. Capture all the
4802   // arguments of the runtime call by reference because they are used in the
4803   // closing of the region.
4804   auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4805     MapInfo = &GenMapInfoCB(Builder.saveIP());
4806     emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
4807                          /*IsNonContiguous=*/true, DeviceAddrCB,
4808                          CustomMapperCB);
4809 
4810     TargetDataRTArgs RTArgs;
4811     emitOffloadingArraysArgument(Builder, RTArgs, Info,
4812                                  !MapInfo->Names.empty());
4813 
4814     // Emit the number of elements in the offloading arrays.
4815     Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
4816 
4817     // Source location for the ident struct
4818     if (!SrcLocInfo) {
4819       uint32_t SrcLocStrSize;
4820       Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4821       SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4822     }
4823 
4824     Value *OffloadingArgs[] = {SrcLocInfo,           DeviceID,
4825                                PointerNum,           RTArgs.BasePointersArray,
4826                                RTArgs.PointersArray, RTArgs.SizesArray,
4827                                RTArgs.MapTypesArray, RTArgs.MapNamesArray,
4828                                RTArgs.MappersArray};
4829 
4830     if (IsStandAlone) {
4831       assert(MapperFunc && "MapperFunc missing for standalone target data");
4832       Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
4833                          OffloadingArgs);
4834     } else {
4835       Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
4836           omp::OMPRTL___tgt_target_data_begin_mapper);
4837 
4838       Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
4839 
4840       for (auto DeviceMap : Info.DevicePtrInfoMap) {
4841         if (isa<AllocaInst>(DeviceMap.second.second)) {
4842           auto *LI =
4843               Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
4844           Builder.CreateStore(LI, DeviceMap.second.second);
4845         }
4846       }
4847 
4848       // If device pointer privatization is required, emit the body of the
4849       // region here. It will have to be duplicated: with and without
4850       // privatization.
4851       Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
4852     }
4853   };
4854 
4855   // If we need device pointer privatization, we need to emit the body of the
4856   // region with no privatization in the 'else' branch of the conditional.
4857   // Otherwise, we don't have to do anything.
4858   auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4859     Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
4860   };
4861 
4862   // Generate code for the closing of the data region.
4863   auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4864     TargetDataRTArgs RTArgs;
4865     emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
4866                                  /*ForEndCall=*/true);
4867 
4868     // Emit the number of elements in the offloading arrays.
4869     Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
4870 
4871     // Source location for the ident struct
4872     if (!SrcLocInfo) {
4873       uint32_t SrcLocStrSize;
4874       Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4875       SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4876     }
4877 
4878     Value *OffloadingArgs[] = {SrcLocInfo,           DeviceID,
4879                                PointerNum,           RTArgs.BasePointersArray,
4880                                RTArgs.PointersArray, RTArgs.SizesArray,
4881                                RTArgs.MapTypesArray, RTArgs.MapNamesArray,
4882                                RTArgs.MappersArray};
4883     Function *EndMapperFunc =
4884         getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
4885 
4886     Builder.CreateCall(EndMapperFunc, OffloadingArgs);
4887   };
4888 
4889   // We don't have to do anything to close the region if the if clause evaluates
4890   // to false.
4891   auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
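  // With a body and an if clause, the emitted structure is roughly
  // (illustrative):
  //   if (IfCond) { __tgt_target_data_begin_mapper(...); <body (Priv)> }
  //   else        { <body (DupNoPriv)> }
  //   <body (NoPriv)>
  //   if (IfCond) { __tgt_target_data_end_mapper(...); }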
4892 
4893   if (BodyGenCB) {
4894     if (IfCond) {
4895       emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
4896     } else {
4897       BeginThenGen(AllocaIP, Builder.saveIP());
4898     }
4899 
4900     // If we don't require privatization of device pointers, we emit the body in
4901     // between the runtime calls. This avoids duplicating the body code.
4902     Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
4903 
4904     if (IfCond) {
4905       emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
4906     } else {
4907       EndThenGen(AllocaIP, Builder.saveIP());
4908     }
4909   } else {
4910     if (IfCond) {
4911       emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
4912     } else {
4913       BeginThenGen(AllocaIP, Builder.saveIP());
4914     }
4915   }
4916 
4917   return Builder.saveIP();
4918 }
4919 
4920 FunctionCallee
4921 OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
4922                                              bool IsGPUDistribute) {
4923   assert((IVSize == 32 || IVSize == 64) &&
4924          "IV size is not compatible with the omp runtime");
4925   RuntimeFunction Name;
4926   if (IsGPUDistribute)
4927     Name = IVSize == 32
4928                ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
4929                            : omp::OMPRTL___kmpc_distribute_static_init_4u)
4930                : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
4931                            : omp::OMPRTL___kmpc_distribute_static_init_8u);
4932   else
4933     Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
4934                                     : omp::OMPRTL___kmpc_for_static_init_4u)
4935                         : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
4936                                     : omp::OMPRTL___kmpc_for_static_init_8u);
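  // E.g. a signed 32-bit IV selects __kmpc_for_static_init_4 and an unsigned
  // 64-bit IV selects __kmpc_for_static_init_8u (or the distribute variants).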
4937 
4938   return getOrCreateRuntimeFunction(M, Name);
4939 }
4940 
4941 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
4942                                                            bool IVSigned) {
4943   assert((IVSize == 32 || IVSize == 64) &&
4944          "IV size is not compatible with the omp runtime");
4945   RuntimeFunction Name = IVSize == 32
4946                              ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
4947                                          : omp::OMPRTL___kmpc_dispatch_init_4u)
4948                              : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
4949                                          : omp::OMPRTL___kmpc_dispatch_init_8u);
4950 
4951   return getOrCreateRuntimeFunction(M, Name);
4952 }
4953 
4954 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
4955                                                            bool IVSigned) {
4956   assert((IVSize == 32 || IVSize == 64) &&
4957          "IV size is not compatible with the omp runtime");
4958   RuntimeFunction Name = IVSize == 32
4959                              ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
4960                                          : omp::OMPRTL___kmpc_dispatch_next_4u)
4961                              : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
4962                                          : omp::OMPRTL___kmpc_dispatch_next_8u);
4963 
4964   return getOrCreateRuntimeFunction(M, Name);
4965 }
4966 
4967 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
4968                                                            bool IVSigned) {
4969   assert((IVSize == 32 || IVSize == 64) &&
4970          "IV size is not compatible with the omp runtime");
4971   RuntimeFunction Name = IVSize == 32
4972                              ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
4973                                          : omp::OMPRTL___kmpc_dispatch_fini_4u)
4974                              : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
4975                                          : omp::OMPRTL___kmpc_dispatch_fini_8u);
4976 
4977   return getOrCreateRuntimeFunction(M, Name);
4978 }
4979 
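/// Replace all uses of \p ConstExpr that occur inside \p Func with an
/// equivalent instruction, so the uses can be rewritten like ordinary
/// instruction operands.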
4980 static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
4981                                                   Function *Func) {
4982   for (User *User : make_early_inc_range(ConstExpr->users()))
4983     if (auto *Instr = dyn_cast<Instruction>(User))
4984       if (Instr->getFunction() == Func)
4985         Instr->replaceUsesOfWith(ConstExpr, ConstExpr->getAsInstruction(Instr));
4986 }
4987 
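/// For every constant-expression user of \p Input, materialize the expression
/// as an instruction inside \p Func so its uses there become rewritable.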
4988 static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
4989                                                     Function *Func) {
4990   for (User *User : make_early_inc_range(Input->users()))
4991     if (auto *Const = dyn_cast<Constant>(User))
4992       if (auto *ConstExpr = dyn_cast<ConstantExpr>(Const))
4993         replaceConstatExprUsesInFuncWithInstr(ConstExpr, Func);
4994 }
4995 
4996 static Function *createOutlinedFunction(
4997     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
4998     SmallVectorImpl<Value *> &Inputs,
4999     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
5000     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
5001   SmallVector<Type *> ParameterTypes;
5002   if (OMPBuilder.Config.isTargetDevice()) {
5003     // Add the "implicit" runtime argument we use to provide launch specific
5004     // information for target devices.
5005     auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
5006     ParameterTypes.push_back(Int8PtrTy);
5007 
5008     // All parameters to target devices are passed as pointers
5009     // or i64. This assumes 64-bit address spaces/pointers.
5010     for (auto &Arg : Inputs)
5011       ParameterTypes.push_back(Arg->getType()->isPointerTy()
5012                                    ? Arg->getType()
5013                                    : Type::getInt64Ty(Builder.getContext()));
5014   } else {
5015     for (auto &Arg : Inputs)
5016       ParameterTypes.push_back(Arg->getType());
5017   }
5018 
5019   auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
5020                                     /*isVarArg*/ false);
5021   auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
5022                                Builder.GetInsertBlock()->getModule());
5023 
5024   // Save insert point.
5025   auto OldInsertPoint = Builder.saveIP();
5026 
5027   // Generate the region into the function.
5028   BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
5029   Builder.SetInsertPoint(EntryBB);
5030 
5031   // Insert target init call in the device compilation pass.
5032   if (OMPBuilder.Config.isTargetDevice())
5033     Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
5034 
5035   BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
5036 
5037   // Generate the body; then insert the target deinit call on the device.
5038   Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
5039   if (OMPBuilder.Config.isTargetDevice())
5040     OMPBuilder.createTargetDeinit(Builder);
5041 
5042   // Insert return instruction.
5043   Builder.CreateRetVoid();
5044 
5045   // New alloca IP at the entry point of the created function.
5046   Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
5047   auto AllocaIP = Builder.saveIP();
5048 
5049   Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
5050 
5051   // Skip the artificial dyn_ptr on the device.
5052   const auto &ArgRange =
5053       OMPBuilder.Config.isTargetDevice()
5054           ? make_range(Func->arg_begin() + 1, Func->arg_end())
5055           : Func->args();
5056 
5057   // Rewrite uses of input values to parameters.
5058   for (auto InArg : zip(Inputs, ArgRange)) {
5059     Value *Input = std::get<0>(InArg);
5060     Argument &Arg = std::get<1>(InArg);
5061     Value *InputCopy = nullptr;
5062 
5063     Builder.restoreIP(
5064         ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
5065 
5066     // Things like GEPs can come in the form of Constants. Constants and
5067     // ConstantExprs do not know which function they are used in, so we
5068     // must dig a little to find an instruction user before we can tell
5069     // whether the use is inside the function we are outlining. We also
5070     // replace the original constant expression with an equivalent new
5071     // instruction: an instruction is owned by our target function, which
5072     // means replaceUsesOfWith can be invoked on it in the following loop
5073     // (this does not appear to work on constants). Materializing a brand
5074     // new instruction is also the cautious choice, since the old
5075     // expression may still be used outside of the function we are
5076     // outlining (unlikely by the nature of a Constant, but still
5077     // possible).
5078     replaceConstantValueUsesInFuncWithInstr(Input, Func);
5079 
5080     // Rewrite uses of the input value inside the outlined function.
5081     for (User *User : make_early_inc_range(Input->users()))
5082       if (auto *Instr = dyn_cast<Instruction>(User))
5083         if (Instr->getFunction() == Func)
5084           Instr->replaceUsesOfWith(Input, InputCopy);
5085   }
5086 
5087   // Restore insert point.
5088   Builder.restoreIP(OldInsertPoint);
5089 
5090   return Func;
5091 }
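
// Illustration only; function and argument names here are hypothetical. On a
// target device, the function built above roughly has the shape:
//
//   define internal void @__omp_offloading_kernel(ptr %dyn_ptr, i64 %a) {
//   entry:
//     %init = call i32 @__kmpc_target_init(...)
//     ; user region generated by CBFunc, with uses of the host input values
//     ; rewritten to the kernel arguments via ArgAccessorFuncCB
//     call void @__kmpc_target_deinit(...)
//     ret void
//   }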
5092 
5093 static void emitTargetOutlinedFunction(
5094     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
5095     TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
5096     Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
5097     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
5098     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
5099 
5100   OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
5101       [&OMPBuilder, &Builder, &Inputs, &CBFunc,
5102        &ArgAccessorFuncCB](StringRef EntryFnName) {
5103         return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
5104                                       CBFunc, ArgAccessorFuncCB);
5105       };
5106 
5107   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
5108                                       OutlinedFn, OutlinedFnID);
5109 }
5110 
5111 static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
5112                            OpenMPIRBuilder::InsertPointTy AllocaIP,
5113                            Function *OutlinedFn, Constant *OutlinedFnID,
5114                            int32_t NumTeams, int32_t NumThreads,
5115                            SmallVectorImpl<Value *> &Args,
5116                            OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
5117 
5118   OpenMPIRBuilder::TargetDataInfo Info(
5119       /*RequiresDevicePointerInfo=*/false,
5120       /*SeparateBeginEndCalls=*/true);
5121 
5122   OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
5123   OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
5124                                   /*IsNonContiguous=*/true);
5125 
5126   OpenMPIRBuilder::TargetDataRTArgs RTArgs;
5127   OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
5128                                           !MapInfo.Names.empty());
5129 
5130   // Host fallback used by emitKernelLaunch if the kernel cannot be launched.
5131   auto &&EmitTargetCallFallbackCB =
5132       [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
5133     Builder.restoreIP(IP);
5134     Builder.CreateCall(OutlinedFn, Args);
5135     return Builder.saveIP();
5136   };
5137 
5138   unsigned NumTargetItems = MapInfo.BasePointers.size();
5139   // TODO: Use correct device ID
5140   Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
5141   Value *NumTeamsVal = Builder.getInt32(NumTeams);
5142   Value *NumThreadsVal = Builder.getInt32(NumThreads);
5143   uint32_t SrcLocStrSize;
5144   Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
5145   Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
5146                                              llvm::omp::IdentFlag(0), 0);
5147   // TODO: Use correct NumIterations
5148   Value *NumIterations = Builder.getInt64(0);
5149   // TODO: Use correct DynCGGroupMem
5150   Value *DynCGGroupMem = Builder.getInt32(0);
5151 
5152   bool HasNoWait = false;
5153 
5154   OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
5155                                           NumTeamsVal, NumThreadsVal,
5156                                           DynCGGroupMem, HasNoWait);
5157 
5158   Builder.restoreIP(OMPBuilder.emitKernelLaunch(
5159       Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
5160       DeviceID, RTLoc, AllocaIP));
5161 }
5162 
5163 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
5164     const LocationDescription &Loc, InsertPointTy AllocaIP,
5165     InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
5166     int32_t NumThreads, SmallVectorImpl<Value *> &Args,
5167     GenMapInfoCallbackTy GenMapInfoCB,
5168     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
5169     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
5170   if (!updateToLocation(Loc))
5171     return InsertPointTy();
5172 
5173   Builder.restoreIP(CodeGenIP);
5174 
5175   Function *OutlinedFn;
5176   Constant *OutlinedFnID;
5177   emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
5178                              OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
5179   if (!Config.isTargetDevice())
5180     emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
5181                    NumThreads, Args, GenMapInfoCB);
5182 
5183   return Builder.saveIP();
5184 }
5185 
5186 std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
5187                                                    StringRef FirstSeparator,
5188                                                    StringRef Separator) {
5189   SmallString<128> Buffer;
5190   llvm::raw_svector_ostream OS(Buffer);
5191   StringRef Sep = FirstSeparator;
5192   for (StringRef Part : Parts) {
5193     OS << Sep << Part;
5194     Sep = Separator;
5195   }
5196   return OS.str().str();
5197 }
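
// For example, getNameWithSeparators({"omp", "reduction"}, "_", ".") yields
// "_omp.reduction".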
5198 
5199 std::string
5200 OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
5201   return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
5202                                                 Config.separator());
5203 }
5204 
5205 GlobalVariable *
5206 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
5207                                              unsigned AddressSpace) {
5208   auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
5209   if (Elem.second) {
5210     assert(Elem.second->getValueType() == Ty &&
5211            "OMP internal variable has different type than requested");
5212   } else {
5213     // TODO: investigate the appropriate linkage type used for the global
5214     // variable, possibly changing it to internal or private, or maybe
5215     // creating different versions of the function for different OMP
5216     // internal variables.
5217     auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
5218                        ? GlobalValue::ExternalLinkage
5219                        : GlobalValue::CommonLinkage;
5220     auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
5221                                   Constant::getNullValue(Ty), Elem.first(),
5222                                   /*InsertBefore=*/nullptr,
5223                                   GlobalValue::NotThreadLocal, AddressSpace);
5224     const DataLayout &DL = M.getDataLayout();
5225     const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
5226     const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
5227     GV->setAlignment(std::max(TypeAlign, PtrAlign));
5228     Elem.second = GV;
5229   }
5230 
5231   return Elem.second;
5232 }
5233 
5234 Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
5235   std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
5236   std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
5237   return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
5238 }
5239 
5240 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
5241   LLVMContext &Ctx = Builder.getContext();
5242   Value *Null =
5243       Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
5244   Value *SizeGep =
5245       Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
5246   Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
5247   return SizePtrToInt;
5248 }
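
// This is the classic "GEP off null" sizeof idiom; the emitted IR is roughly
// (where <ty> is the type of BasePtr):
//   %gep  = getelementptr <ty>, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64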
5249 
5250 GlobalVariable *
5251 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
5252                                        std::string VarName) {
5253   llvm::Constant *MaptypesArrayInit =
5254       llvm::ConstantDataArray::get(M.getContext(), Mappings);
5255   auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
5256       M, MaptypesArrayInit->getType(),
5257       /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
5258       VarName);
5259   MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
5260   return MaptypesArrayGlobal;
5261 }
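
// For example (illustrative), Mappings = {1, 3} with VarName
// ".offload_maptypes" produces:
//   @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 1, i64 3]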
5262 
5263 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
5264                                           InsertPointTy AllocaIP,
5265                                           unsigned NumOperands,
5266                                           struct MapperAllocas &MapperAllocas) {
5267   if (!updateToLocation(Loc))
5268     return;
5269 
5270   auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
5271   auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
5272   Builder.restoreIP(AllocaIP);
5273   AllocaInst *ArgsBase = Builder.CreateAlloca(
5274       ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
5275   AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
5276                                           ".offload_ptrs");
5277   AllocaInst *ArgSizes = Builder.CreateAlloca(
5278       ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
5279   Builder.restoreIP(Loc.IP);
5280   MapperAllocas.ArgsBase = ArgsBase;
5281   MapperAllocas.Args = Args;
5282   MapperAllocas.ArgSizes = ArgSizes;
5283 }
5284 
5285 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
5286                                      Function *MapperFunc, Value *SrcLocInfo,
5287                                      Value *MaptypesArg, Value *MapnamesArg,
5288                                      struct MapperAllocas &MapperAllocas,
5289                                      int64_t DeviceID, unsigned NumOperands) {
5290   if (!updateToLocation(Loc))
5291     return;
5292 
5293   auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
5294   auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
5295   Value *ArgsBaseGEP =
5296       Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
5297                                 {Builder.getInt32(0), Builder.getInt32(0)});
5298   Value *ArgsGEP =
5299       Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
5300                                 {Builder.getInt32(0), Builder.getInt32(0)});
5301   Value *ArgSizesGEP =
5302       Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
5303                                 {Builder.getInt32(0), Builder.getInt32(0)});
5304   Value *NullPtr =
5305       Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
5306   Builder.CreateCall(MapperFunc,
5307                      {SrcLocInfo, Builder.getInt64(DeviceID),
5308                       Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
5309                       ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
5310 }
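
// The emitted call has this rough shape, shown here for a hypothetical use
// with the __tgt_target_data_begin_mapper entry:
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device_id,
//       i32 %num_operands, ptr %args_base, ptr %args, ptr %arg_sizes,
//       ptr %maptypes, ptr %mapnames, ptr null)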
5311 
5312 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
5313                                                    TargetDataRTArgs &RTArgs,
5314                                                    TargetDataInfo &Info,
5315                                                    bool EmitDebug,
5316                                                    bool ForEndCall) {
5317   assert((!ForEndCall || Info.separateBeginEndCalls()) &&
5318          "expected region end call to runtime only when end call is separate");
5319   auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
5320   auto VoidPtrTy = UnqualPtrTy;
5321   auto VoidPtrPtrTy = UnqualPtrTy;
5322   auto Int64Ty = Type::getInt64Ty(M.getContext());
5323   auto Int64PtrTy = UnqualPtrTy;
5324 
5325   if (!Info.NumberOfPtrs) {
5326     RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5327     RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5328     RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
5329     RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
5330     RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
5331     RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5332     return;
5333   }
5334 
5335   RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
5336       ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
5337       Info.RTArgs.BasePointersArray,
5338       /*Idx0=*/0, /*Idx1=*/0);
5339   RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
5340       ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
5341       /*Idx0=*/0,
5342       /*Idx1=*/0);
5343   RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
5344       ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
5345       /*Idx0=*/0, /*Idx1=*/0);
5346   RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
5347       ArrayType::get(Int64Ty, Info.NumberOfPtrs),
5348       ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
5349                                                  : Info.RTArgs.MapTypesArray,
5350       /*Idx0=*/0,
5351       /*Idx1=*/0);
5352 
5353   // Only emit the mapper information arrays if debug information is
5354   // requested.
5355   if (!EmitDebug)
5356     RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
5357   else
5358     RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
5359         ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
5360         /*Idx0=*/0,
5361         /*Idx1=*/0);
5362   // If there is no user-defined mapper, set the mapper array to nullptr to
5363   // avoid an unnecessary data privatization
5364   if (!Info.HasMapper)
5365     RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5366   else
5367     RTArgs.MappersArray =
5368         Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
5369 }
5370 
5371 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
5372                                                   InsertPointTy CodeGenIP,
5373                                                   MapInfosTy &CombinedInfo,
5374                                                   TargetDataInfo &Info) {
5375   MapInfosTy::StructNonContiguousInfo &NonContigInfo =
5376       CombinedInfo.NonContigInfo;
5377 
5378   // Build an array of struct descriptor_dim and then assign it to
5379   // offload_args.
5380   //
5381   // struct descriptor_dim {
5382   //  uint64_t offset;
5383   //  uint64_t count;
5384   //  uint64_t stride
5385   // };
5386   Type *Int64Ty = Builder.getInt64Ty();
5387   StructType *DimTy = StructType::create(
5388       M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
5389       "struct.descriptor_dim");
5390 
5391   enum { OffsetFD = 0, CountFD, StrideFD };
5392   // We need two index variables here: the size of "Dims" matches the
5393   // number of components, whereas the offset, count, and stride arrays
5394   // only have entries for the non-contiguous base declarations.
5395   for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
5396     // Skip emitting ir if dimension size is 1 since it cannot be
5397     // non-contiguous.
5398     if (NonContigInfo.Dims[I] == 1)
5399       continue;
5400     Builder.restoreIP(AllocaIP);
5401     ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
5402     AllocaInst *DimsAddr =
5403         Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
5404     Builder.restoreIP(CodeGenIP);
5405     for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
5406       unsigned RevIdx = EE - II - 1;
5407       Value *DimsLVal = Builder.CreateInBoundsGEP(
5408           DimsAddr->getAllocatedType(), DimsAddr,
5409           {Builder.getInt64(0), Builder.getInt64(II)});
5410       // Offset
5411       Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
5412       Builder.CreateAlignedStore(
5413           NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
5414           M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
5415       // Count
5416       Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
5417       Builder.CreateAlignedStore(
5418           NonContigInfo.Counts[L][RevIdx], CountLVal,
5419           M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
5420       // Stride
5421       Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
5422       Builder.CreateAlignedStore(
5423           NonContigInfo.Strides[L][RevIdx], StrideLVal,
5424           M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
5425     }
5426     // args[I] = &dims
5427     Builder.restoreIP(CodeGenIP);
5428     Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
5429         DimsAddr, Builder.getPtrTy());
5430     Value *P = Builder.CreateConstInBoundsGEP2_32(
5431         ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
5432         Info.RTArgs.PointersArray, 0, I);
5433     Builder.CreateAlignedStore(
5434         DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
5435     ++L;
5436   }
5437 }
5438 
5439 void OpenMPIRBuilder::emitOffloadingArrays(
5440     InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
5441     TargetDataInfo &Info, bool IsNonContiguous,
5442     function_ref<void(unsigned int, Value *)> DeviceAddrCB,
5443     function_ref<Value *(unsigned int)> CustomMapperCB) {
5444 
5445   // Reset the array information.
5446   Info.clearArrayInfo();
5447   Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
5448 
5449   if (Info.NumberOfPtrs == 0)
5450     return;
5451 
5452   Builder.restoreIP(AllocaIP);
5453   // Detect whether any captured size requires runtime evaluation; if none
5454   // does, a constant array can eventually be used for the sizes.
5455   ArrayType *PointerArrayType =
5456       ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
5457 
5458   Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
5459       PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
5460 
5461   Info.RTArgs.PointersArray = Builder.CreateAlloca(
5462       PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
5463   AllocaInst *MappersArray = Builder.CreateAlloca(
5464       PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
5465   Info.RTArgs.MappersArray = MappersArray;
5466 
5467   // If we don't have any VLA types or other types that require runtime
5468   // evaluation, we can use a constant array for the map sizes, otherwise we
5469   // need to fill up the arrays as we do for the pointers.
5470   Type *Int64Ty = Builder.getInt64Ty();
5471   SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
5472                                      ConstantInt::get(Int64Ty, 0));
5473   SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
5474   for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
5475     if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
5476       if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
5477         if (IsNonContiguous &&
5478             static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5479                 CombinedInfo.Types[I] &
5480                 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
5481           ConstSizes[I] =
5482               ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
5483         else
5484           ConstSizes[I] = CI;
5485         continue;
5486       }
5487     }
5488     RuntimeSizes.set(I);
5489   }
5490 
5491   if (RuntimeSizes.all()) {
5492     ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
5493     Info.RTArgs.SizesArray = Builder.CreateAlloca(
5494         SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
5495     Builder.restoreIP(CodeGenIP);
5496   } else {
5497     auto *SizesArrayInit = ConstantArray::get(
5498         ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
5499     std::string Name = createPlatformSpecificName({"offload_sizes"});
5500     auto *SizesArrayGbl =
5501         new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
5502                            GlobalValue::PrivateLinkage, SizesArrayInit, Name);
5503     SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
5504 
5505     if (!RuntimeSizes.any()) {
5506       Info.RTArgs.SizesArray = SizesArrayGbl;
5507     } else {
5508       unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
5509       Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
5510       ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
5511       AllocaInst *Buffer = Builder.CreateAlloca(
5512           SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
5513       Buffer->setAlignment(OffloadSizeAlign);
5514       Builder.restoreIP(CodeGenIP);
5515       Builder.CreateMemCpy(
5516           Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
5517           SizesArrayGbl, OffloadSizeAlign,
5518           Builder.getIntN(
5519               IndexSize,
5520               Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
5521 
5522       Info.RTArgs.SizesArray = Buffer;
5523     }
5524     Builder.restoreIP(CodeGenIP);
5525   }
5526 
5527   // The map types are always constant so we don't need to generate code to
5528   // fill arrays. Instead, we create an array constant.
5529   SmallVector<uint64_t, 4> Mapping;
5530   for (auto mapFlag : CombinedInfo.Types)
5531     Mapping.push_back(
5532         static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5533             mapFlag));
5534   std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
5535   auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
5536   Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
5537 
5538   // The information types are only built if provided.
5539   if (!CombinedInfo.Names.empty()) {
5540     std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
5541     auto *MapNamesArrayGbl =
5542         createOffloadMapnames(CombinedInfo.Names, MapnamesName);
5543     Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
5544   } else {
5545     Info.RTArgs.MapNamesArray =
5546         Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
5547   }
5548 
5549   // If there's a present map type modifier, it must not be applied to the end
5550   // of a region, so generate a separate map type array in that case.
5551   if (Info.separateBeginEndCalls()) {
5552     bool EndMapTypesDiffer = false;
5553     for (uint64_t &Type : Mapping) {
5554       if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5555                      OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
5556         Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5557             OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
5558         EndMapTypesDiffer = true;
5559       }
5560     }
5561     if (EndMapTypesDiffer) {
5562       MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
5563       Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
5564     }
5565   }
5566 
5567   PointerType *PtrTy = Builder.getPtrTy();
5568   for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
5569     Value *BPVal = CombinedInfo.BasePointers[I];
5570     Value *BP = Builder.CreateConstInBoundsGEP2_32(
5571         ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
5572         0, I);
5573     Builder.CreateAlignedStore(BPVal, BP,
5574                                M.getDataLayout().getPrefTypeAlign(PtrTy));
5575 
5576     if (Info.requiresDevicePointerInfo()) {
5577       if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
5578         CodeGenIP = Builder.saveIP();
5579         Builder.restoreIP(AllocaIP);
5580         Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
5581         Builder.restoreIP(CodeGenIP);
5582         if (DeviceAddrCB)
5583           DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
5584       } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
5585         Info.DevicePtrInfoMap[BPVal] = {BP, BP};
5586         if (DeviceAddrCB)
5587           DeviceAddrCB(I, BP);
5588       }
5589     }
5590 
5591     Value *PVal = CombinedInfo.Pointers[I];
5592     Value *P = Builder.CreateConstInBoundsGEP2_32(
5593         ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
5594         I);
5595     // TODO: Check that the alignment is correct.
5596     Builder.CreateAlignedStore(PVal, P,
5597                                M.getDataLayout().getPrefTypeAlign(PtrTy));
5598 
5599     if (RuntimeSizes.test(I)) {
5600       Value *S = Builder.CreateConstInBoundsGEP2_32(
5601           ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
5602           /*Idx0=*/0,
5603           /*Idx1=*/I);
5604       Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
5605                                                        Int64Ty,
5606                                                        /*isSigned=*/true),
5607                                  S, M.getDataLayout().getPrefTypeAlign(PtrTy));
5608     }
5609     // Fill up the mapper array.
5610     unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
5611     Value *MFunc = ConstantPointerNull::get(PtrTy);
5612     if (CustomMapperCB)
5613       if (Value *CustomMFunc = CustomMapperCB(I))
5614         MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
5615     Value *MAddr = Builder.CreateInBoundsGEP(
5616         MappersArray->getAllocatedType(), MappersArray,
5617         {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
5618     Builder.CreateAlignedStore(
5619         MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
5620   }
5621 
5622   if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
5623       Info.NumberOfPtrs == 0)
5624     return;
5625   emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
5626 }
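
// After this function runs, the offloading arguments are materialized roughly
// as follows (illustrative; exact names depend on the platform separators):
//   %.offload_baseptrs = alloca [N x ptr]   ; one slot per mapped item
//   %.offload_ptrs     = alloca [N x ptr]
//   %.offload_mappers  = alloca [N x ptr]
//   %.offload_sizes    = alloca [N x i64]   ; or a private constant global
//                                           ; when all sizes are constant
//   @.offload_maptypes = private unnamed_addr constant [N x i64] [...]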
5627 
5628 void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
5629   BasicBlock *CurBB = Builder.GetInsertBlock();
5630 
5631   if (!CurBB || CurBB->getTerminator()) {
5632     // If there is no insert point or the previous block is already
5633     // terminated, don't touch it.
5634   } else {
5635     // Otherwise, create a fall-through branch.
5636     Builder.CreateBr(Target);
5637   }
5638 
5639   Builder.ClearInsertionPoint();
5640 }
5641 
5642 void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
5643                                 bool IsFinished) {
5644   BasicBlock *CurBB = Builder.GetInsertBlock();
5645 
5646   // Fall out of the current block (if necessary).
5647   emitBranch(BB);
5648 
5649   if (IsFinished && BB->use_empty()) {
5650     BB->eraseFromParent();
5651     return;
5652   }
5653 
5654   // Place the block after the current block, if possible, or else at
5655   // the end of the function.
5656   if (CurBB && CurBB->getParent())
5657     CurFn->insert(std::next(CurBB->getIterator()), BB);
5658   else
5659     CurFn->insert(CurFn->end(), BB);
5660   Builder.SetInsertPoint(BB);
5661 }
5662 
5663 void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
5664                                    BodyGenCallbackTy ElseGen,
5665                                    InsertPointTy AllocaIP) {
5666   // If the condition constant folds and can be elided, try to avoid emitting
5667   // the condition and the dead arm of the if/else.
5668   if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
5669     auto CondConstant = CI->getSExtValue();
5670     if (CondConstant)
5671       ThenGen(AllocaIP, Builder.saveIP());
5672     else
5673       ElseGen(AllocaIP, Builder.saveIP());
5674     return;
5675   }
5676 
5677   Function *CurFn = Builder.GetInsertBlock()->getParent();
5678 
5679   // Otherwise, the condition did not fold, or we couldn't elide it.  Just
5680   // emit the conditional branch.
5681   BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
5682   BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
5683   BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
5684   Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
5685   // Emit the 'then' code.
5686   emitBlock(ThenBlock, CurFn);
5687   ThenGen(AllocaIP, Builder.saveIP());
5688   emitBranch(ContBlock);
5689   // Emit the 'else' code if present.
5690   // There is no need to emit line number for unconditional branch.
5691   emitBlock(ElseBlock, CurFn);
5692   ElseGen(AllocaIP, Builder.saveIP());
5693   // There is no need to emit line number for unconditional branch.
5694   emitBranch(ContBlock);
5695   // Emit the continuation block for code after the if.
5696   emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
5697 }
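
// When Cond does not constant-fold, the emitted control flow is:
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// omp_if.then:                             ; ThenGen body
//   br label %omp_if.end
// omp_if.else:                             ; ElseGen body
//   br label %omp_if.end
// omp_if.end:                              ; code after the if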
5698 
5699 bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
5700     const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
5701   assert(!(AO == AtomicOrdering::NotAtomic ||
5702            AO == llvm::AtomicOrdering::Unordered) &&
5703          "Unexpected Atomic Ordering.");
5704 
5705   bool Flush = false;
5706   llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
5707 
5708   switch (AK) {
5709   case Read:
5710     if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
5711         AO == AtomicOrdering::SequentiallyConsistent) {
5712       FlushAO = AtomicOrdering::Acquire;
5713       Flush = true;
5714     }
5715     break;
5716   case Write:
5717   case Compare:
5718   case Update:
5719     if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
5720         AO == AtomicOrdering::SequentiallyConsistent) {
5721       FlushAO = AtomicOrdering::Release;
5722       Flush = true;
5723     }
5724     break;
5725   case Capture:
5726     switch (AO) {
5727     case AtomicOrdering::Acquire:
5728       FlushAO = AtomicOrdering::Acquire;
5729       Flush = true;
5730       break;
5731     case AtomicOrdering::Release:
5732       FlushAO = AtomicOrdering::Release;
5733       Flush = true;
5734       break;
5735     case AtomicOrdering::AcquireRelease:
5736     case AtomicOrdering::SequentiallyConsistent:
5737       FlushAO = AtomicOrdering::AcquireRelease;
5738       Flush = true;
5739       break;
5740     default:
5741       // do nothing - leave silently.
5742       break;
5743     }
5744   }
5745 
5746   if (Flush) {
5747     // The flush runtime call does not yet take a memory ordering. Until it
5748     // does, we still resolve which atomic ordering would be used, but only
5749     // issue the plain flush call.
5750     // TODO: pass `FlushAO` once memory ordering support is added.
5751     (void)FlushAO;
5752     emitFlush(Loc);
5753   }
5754 
5755   // For AO == AtomicOrdering::Monotonic and all other combinations,
5756   // do nothing.
5757   return Flush;
5758 }
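
// In summary, a flush is emitted for the following combinations:
//   Read:                   acquire, acq_rel, seq_cst
//   Write, Update, Compare: release, acq_rel, seq_cst
//   Capture:                acquire, release, acq_rel, seq_cst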
5759 
5760 OpenMPIRBuilder::InsertPointTy
5761 OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
5762                                   AtomicOpValue &X, AtomicOpValue &V,
5763                                   AtomicOrdering AO) {
5764   if (!updateToLocation(Loc))
5765     return Loc.IP;
5766 
5767   assert(X.Var->getType()->isPointerTy() &&
5768          "OMP Atomic expects a pointer to target memory");
5769   Type *XElemTy = X.ElemTy;
5770   assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5771           XElemTy->isPointerTy()) &&
5772          "OMP atomic read expected a scalar type");
5773 
5774   Value *XRead = nullptr;
5775 
5776   if (XElemTy->isIntegerTy()) {
5777     LoadInst *XLD =
5778         Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
5779     XLD->setAtomic(AO);
5780     XRead = cast<Value>(XLD);
5781   } else {
5782     // We need to perform the atomic op as an integer.
5783     IntegerType *IntCastTy =
5784         IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5785     LoadInst *XLoad =
5786         Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
5787     XLoad->setAtomic(AO);
5788     if (XElemTy->isFloatingPointTy()) {
5789       XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
5790     } else {
5791       XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
5792     }
5793   }
5794   checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
5795   Builder.CreateStore(XRead, V.Var, V.IsVolatile);
5796   return Builder.saveIP();
5797 }
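
// Illustrative IR for an atomic read of a float (monotonic shown; the actual
// ordering comes from AO):
//   %omp.atomic.load = load atomic i32, ptr %x monotonic, align 4
//   %atomic.flt.cast = bitcast i32 %omp.atomic.load to float
//   store float %atomic.flt.cast, ptr %v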
5798 
5799 OpenMPIRBuilder::InsertPointTy
5800 OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
5801                                    AtomicOpValue &X, Value *Expr,
5802                                    AtomicOrdering AO) {
5803   if (!updateToLocation(Loc))
5804     return Loc.IP;
5805 
5806   assert(X.Var->getType()->isPointerTy() &&
5807          "OMP Atomic expects a pointer to target memory");
5808   Type *XElemTy = X.ElemTy;
5809   assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5810           XElemTy->isPointerTy()) &&
5811          "OMP atomic write expected a scalar type");
5812 
5813   if (XElemTy->isIntegerTy()) {
5814     StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
5815     XSt->setAtomic(AO);
5816   } else {
5817     // We need to bitcast and perform the atomic op as an integer.
5818     IntegerType *IntCastTy =
5819         IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5820     Value *ExprCast =
5821         Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
5822     StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
5823     XSt->setAtomic(AO);
5824   }
5825 
5826   checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
5827   return Builder.saveIP();
5828 }
5829 
5830 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
5831     const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
5832     Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
5833     AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
5834   assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
5835   if (!updateToLocation(Loc))
5836     return Loc.IP;
5837 
5838   LLVM_DEBUG({
5839     Type *XTy = X.Var->getType();
5840     assert(XTy->isPointerTy() &&
5841            "OMP Atomic expects a pointer to target memory");
5842     Type *XElemTy = X.ElemTy;
5843     assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5844             XElemTy->isPointerTy()) &&
5845            "OMP atomic update expected a scalar type");
5846     assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
5847            (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
5848            "OpenMP atomic does not support LT or GT operations");
5849   });
5850 
5851   emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
5852                    X.IsVolatile, IsXBinopExpr);
5853   checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
5854   return Builder.saveIP();
5855 }
5856 
5857 // FIXME: Duplicating AtomicExpand
5858 Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
5859                                                AtomicRMWInst::BinOp RMWOp) {
5860   switch (RMWOp) {
5861   case AtomicRMWInst::Add:
5862     return Builder.CreateAdd(Src1, Src2);
5863   case AtomicRMWInst::Sub:
5864     return Builder.CreateSub(Src1, Src2);
5865   case AtomicRMWInst::And:
5866     return Builder.CreateAnd(Src1, Src2);
5867   case AtomicRMWInst::Nand:
5868     return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
5869   case AtomicRMWInst::Or:
5870     return Builder.CreateOr(Src1, Src2);
5871   case AtomicRMWInst::Xor:
5872     return Builder.CreateXor(Src1, Src2);
5873   case AtomicRMWInst::Xchg:
5874   case AtomicRMWInst::FAdd:
5875   case AtomicRMWInst::FSub:
5876   case AtomicRMWInst::BAD_BINOP:
5877   case AtomicRMWInst::Max:
5878   case AtomicRMWInst::Min:
5879   case AtomicRMWInst::UMax:
5880   case AtomicRMWInst::UMin:
5881   case AtomicRMWInst::FMax:
5882   case AtomicRMWInst::FMin:
5883   case AtomicRMWInst::UIncWrap:
5884   case AtomicRMWInst::UDecWrap:
5885     llvm_unreachable("Unsupported atomic update operation");
5886   }
5887   llvm_unreachable("Unsupported atomic update operation");
5888 }
5889 
5890 std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
5891     InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
5892     AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
5893     AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
5894   // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
5895   // or a complex datatype.
5896   bool emitRMWOp = false;
5897   switch (RMWOp) {
5898   case AtomicRMWInst::Add:
5899   case AtomicRMWInst::And:
5900   case AtomicRMWInst::Nand:
5901   case AtomicRMWInst::Or:
5902   case AtomicRMWInst::Xor:
5903   case AtomicRMWInst::Xchg:
5904     emitRMWOp = XElemTy;
5905     break;
5906   case AtomicRMWInst::Sub:
5907     emitRMWOp = (IsXBinopExpr && XElemTy);
5908     break;
5909   default:
5910     emitRMWOp = false;
5911   }
5912   emitRMWOp &= XElemTy->isIntegerTy();
5913 
5914   std::pair<Value *, Value *> Res;
5915   if (emitRMWOp) {
5916     Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
5917     // Res.second is only needed for postfix captures; generate it anyway
5918     // for consistency with the else branch (any DCE pass will remove it).
5919     // AtomicRMWInst::Xchg has no corresponding non-atomic instruction.
5920     if (RMWOp == AtomicRMWInst::Xchg)
5921       Res.second = Res.first;
5922     else
5923       Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
5924   } else {
5925     IntegerType *IntCastTy =
5926         IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5927     LoadInst *OldVal =
5928         Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
5929     OldVal->setAtomic(AO);
5930     // CurBB
5931     // |     /---\
5932     // ContBB    |
5933     // |     \---/
5934     // ExitBB
5935     BasicBlock *CurBB = Builder.GetInsertBlock();
5936     Instruction *CurBBTI = CurBB->getTerminator();
5937     CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
5938     BasicBlock *ExitBB =
5939         CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
5940     BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
5941                                                 X->getName() + ".atomic.cont");
5942     ContBB->getTerminator()->eraseFromParent();
5943     Builder.restoreIP(AllocaIP);
5944     AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
5945     NewAtomicAddr->setName(X->getName() + "x.new.val");
5946     Builder.SetInsertPoint(ContBB);
5947     llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
5948     PHI->addIncoming(OldVal, CurBB);
5949     bool IsIntTy = XElemTy->isIntegerTy();
5950     Value *OldExprVal = PHI;
5951     if (!IsIntTy) {
5952       if (XElemTy->isFloatingPointTy()) {
5953         OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
5954                                            X->getName() + ".atomic.fltCast");
5955       } else {
5956         OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
5957                                             X->getName() + ".atomic.ptrCast");
5958       }
5959     }
5960 
5961     Value *Upd = UpdateOp(OldExprVal, Builder);
5962     Builder.CreateStore(Upd, NewAtomicAddr);
5963     LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
5964     AtomicOrdering Failure =
5965         llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
5966     AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
5967         X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
5968     Result->setVolatile(VolatileX);
5969     Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
5970     Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
5971     PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
5972     Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
5973 
5974     Res.first = OldExprVal;
5975     Res.second = Upd;
5976 
5977     // Set the insertion point in the exit block.
5978     if (UnreachableInst *ExitTI =
5979             dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
5980       CurBBTI->eraseFromParent();
5981       Builder.SetInsertPoint(ExitBB);
5982     } else {
5983       Builder.SetInsertPoint(ExitTI);
5984     }
5985   }
5986 
5987   return Res;
5988 }
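
// Illustrative shape of the cmpxchg fallback loop generated above for an
// integer 'x' (value names are hypothetical):
//   %old = load atomic i32, ptr %x <AO>
//   br label %x.atomic.cont
// x.atomic.cont:
//   %phi  = phi i32 [ %old, %entry ], [ %prev, %x.atomic.cont ]
//   ; %upd = UpdateOp(%phi), stored to and reloaded from NewAtomicAddr
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %upd <AO> <failure-AO>
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok   = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %x.atomic.exit, label %x.atomic.cont
// x.atomic.exit: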
5989 
5990 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
5991     const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
5992     AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
5993     AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
5994     bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
5995   if (!updateToLocation(Loc))
5996     return Loc.IP;
5997 
5998   LLVM_DEBUG({
5999     Type *XTy = X.Var->getType();
6000     assert(XTy->isPointerTy() &&
6001            "OMP Atomic expects a pointer to target memory");
6002     Type *XElemTy = X.ElemTy;
6003     assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
6004             XElemTy->isPointerTy()) &&
6005            "OMP atomic capture expected a scalar type");
6006     assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
6007            "OpenMP atomic does not support LT or GT operations");
6008   });
6009 
6010   // If UpdateExpr is false, i.e. 'x' is updated with some `expr` that is
6011   // not based on 'x', 'x' is simply atomically exchanged with 'expr'.
6012   AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
6013   std::pair<Value *, Value *> Result =
6014       emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
6015                        X.IsVolatile, IsXBinopExpr);
6016 
6017   Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
6018   Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
6019 
6020   checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
6021   return Builder.saveIP();
6022 }
6023 
6024 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
6025     const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
6026     AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
6027     omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
6028     bool IsFailOnly) {
6029 
6030   AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
6031   return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
6032                              IsPostfixUpdate, IsFailOnly, Failure);
6033 }
6034 
6035 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
6036     const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
6037     AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
6038     omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
6039     bool IsFailOnly, AtomicOrdering Failure) {
6040 
6041   if (!updateToLocation(Loc))
6042     return Loc.IP;
6043 
6044   assert(X.Var->getType()->isPointerTy() &&
6045          "OMP atomic expects a pointer to target memory");
6046   // Compare capture: validate 'v' when present.
6047   if (V.Var) {
6048     assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
6049     assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
6050   }
6051 
6052   bool IsInteger = E->getType()->isIntegerTy();
6053 
6054   if (Op == OMPAtomicCompareOp::EQ) {
6055     AtomicCmpXchgInst *Result = nullptr;
6056     if (!IsInteger) {
6057       IntegerType *IntCastTy =
6058           IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
6059       Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
6060       Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
6061       Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
6062                                            AO, Failure);
6063     } else {
6064       Result =
6065           Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
6066     }
6067 
6068     if (V.Var) {
6069       Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
6070       if (!IsInteger)
6071         OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
6072       assert(OldValue->getType() == V.ElemTy &&
6073              "OldValue and V must be of same type");
6074       if (IsPostfixUpdate) {
6075         Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
6076       } else {
6077         Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6078         if (IsFailOnly) {
6079           // CurBB----
6080           //   |     |
6081           //   v     |
6082           // ContBB  |
6083           //   |     |
6084           //   v     |
6085           // ExitBB <-
6086           //
6087           // where ContBB only contains the store of old value to 'v'.
6088           BasicBlock *CurBB = Builder.GetInsertBlock();
6089           Instruction *CurBBTI = CurBB->getTerminator();
6090           CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
6091           BasicBlock *ExitBB = CurBB->splitBasicBlock(
6092               CurBBTI, X.Var->getName() + ".atomic.exit");
6093           BasicBlock *ContBB = CurBB->splitBasicBlock(
6094               CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
6095           ContBB->getTerminator()->eraseFromParent();
6096           CurBB->getTerminator()->eraseFromParent();
6097 
6098           Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
6099 
6100           Builder.SetInsertPoint(ContBB);
6101           Builder.CreateStore(OldValue, V.Var);
6102           Builder.CreateBr(ExitBB);
6103 
6104           if (UnreachableInst *ExitTI =
6105                   dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
6106             CurBBTI->eraseFromParent();
6107             Builder.SetInsertPoint(ExitBB);
6108           } else {
6109             Builder.SetInsertPoint(ExitTI);
6110           }
6111         } else {
6112           Value *CapturedValue =
6113               Builder.CreateSelect(SuccessOrFail, E, OldValue);
6114           Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
6115         }
6116       }
6117     }
6118     // The comparison result has to be stored.
6119     if (R.Var) {
6120       assert(R.Var->getType()->isPointerTy() &&
6121              "r.var must be of pointer type");
6122       assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
6123 
6124       Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6125       Value *ResultCast = R.IsSigned
6126                               ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
6127                               : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
6128       Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
6129     }
6130   } else {
6131     assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
6132            "Op should be either max or min at this point");
6133     assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
6134 
6135     // Reverse the ordering op, as the OpenMP forms differ from the LLVM
6136     // forms. Take max as an example.
6137     // OpenMP form:
6138     //   x = x > expr ? expr : x;
6139     // LLVM (atomicrmw max) form:
6140     //   *ptr = *ptr > val ? *ptr : val;
6141     // So the OpenMP form must be expressed with the reversed op:
6142     //   x = x <= expr ? x : expr;
6143     AtomicRMWInst::BinOp NewOp;
6144     if (IsXBinopExpr) {
6145       if (IsInteger) {
6146         if (X.IsSigned)
6147           NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
6148                                                 : AtomicRMWInst::Max;
6149         else
6150           NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
6151                                                 : AtomicRMWInst::UMax;
6152       } else {
6153         NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
6154                                               : AtomicRMWInst::FMax;
6155       }
6156     } else {
6157       if (IsInteger) {
6158         if (X.IsSigned)
6159           NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
6160                                                 : AtomicRMWInst::Min;
6161         else
6162           NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
6163                                                 : AtomicRMWInst::UMin;
6164       } else {
6165         NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
6166                                               : AtomicRMWInst::FMin;
6167       }
6168     }
6169 
6170     AtomicRMWInst *OldValue =
6171         Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
6172     if (V.Var) {
6173       Value *CapturedValue = nullptr;
6174       if (IsPostfixUpdate) {
6175         CapturedValue = OldValue;
6176       } else {
6177         CmpInst::Predicate Pred;
6178         switch (NewOp) {
6179         case AtomicRMWInst::Max:
6180           Pred = CmpInst::ICMP_SGT;
6181           break;
6182         case AtomicRMWInst::UMax:
6183           Pred = CmpInst::ICMP_UGT;
6184           break;
6185         case AtomicRMWInst::FMax:
6186           Pred = CmpInst::FCMP_OGT;
6187           break;
6188         case AtomicRMWInst::Min:
6189           Pred = CmpInst::ICMP_SLT;
6190           break;
6191         case AtomicRMWInst::UMin:
6192           Pred = CmpInst::ICMP_ULT;
6193           break;
6194         case AtomicRMWInst::FMin:
6195           Pred = CmpInst::FCMP_OLT;
6196           break;
6197         default:
6198           llvm_unreachable("unexpected comparison op");
6199         }
6200         Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
6201         CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
6202       }
6203       Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
6204     }
6205   }
6206 
6207   checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
6208 
6209   return Builder.saveIP();
6210 }
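
// Summary of the atomicrmw op selected above for the MAX/MIN forms:
//   IsXBinopExpr == true : MAX -> min/umin/fmin, MIN -> max/umax/fmax
//   IsXBinopExpr == false: MAX -> max/umax/fmax, MIN -> min/umin/fmin
// (signed vs. unsigned is chosen by X.IsSigned for the integer case).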
6211 
6212 OpenMPIRBuilder::InsertPointTy
6213 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
6214                              BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
6215                              Value *NumTeamsUpper, Value *ThreadLimit,
6216                              Value *IfExpr) {
6217   if (!updateToLocation(Loc))
6218     return InsertPointTy();
6219 
6220   uint32_t SrcLocStrSize;
6221   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6222   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6223   Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
6224 
6225   // Outer allocation basicblock is the entry block of the current function.
6226   BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
6227   if (&OuterAllocaBB == Builder.GetInsertBlock()) {
6228     BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
6229     Builder.SetInsertPoint(BodyBB, BodyBB->begin());
6230   }
6231 
6232   // The current basic block is split into four basic blocks. After outlining,
6233   // they will be mapped as follows:
6234   // ```
6235   // def current_fn() {
6236   //   current_basic_block:
6237   //     br label %teams.exit
6238   //   teams.exit:
6239   //     ; instructions after teams
6240   // }
6241   //
6242   // def outlined_fn() {
6243   //   teams.alloca:
6244   //     br label %teams.body
6245   //   teams.body:
6246   //     ; instructions within teams body
6247   // }
6248   // ```
6249   BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
6250   BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
6251   BasicBlock *AllocaBB =
6252       splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
6253 
6254   // Push num_teams
6255   if (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr) {
6256     assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
6257            "if lowerbound is non-null, then upperbound must also be non-null "
6258            "for bounds on num_teams");
6259 
6260     if (NumTeamsUpper == nullptr)
6261       NumTeamsUpper = Builder.getInt32(0);
6262 
6263     if (NumTeamsLower == nullptr)
6264       NumTeamsLower = NumTeamsUpper;
6265 
6266     if (IfExpr) {
6267       assert(IfExpr->getType()->isIntegerTy() &&
6268              "argument to if clause must be an integer value");
6269 
6270       // upper = ifexpr ? upper : 1
6271       if (IfExpr->getType() != Int1)
6272         IfExpr = Builder.CreateICmpNE(IfExpr,
6273                                       ConstantInt::get(IfExpr->getType(), 0));
6274       NumTeamsUpper = Builder.CreateSelect(
6275           IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
6276 
6277       // lower = ifexpr ? lower : 1
6278       NumTeamsLower = Builder.CreateSelect(
6279           IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
6280     }
6281 
6282     if (ThreadLimit == nullptr)
6283       ThreadLimit = Builder.getInt32(0);
6284 
6285     Value *ThreadNum = getOrCreateThreadID(Ident);
6286     Builder.CreateCall(
6287         getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
6288         {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
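         // The emitted call has the form (sketch):
         //   call void @__kmpc_push_num_teams_51(ptr %ident, i32 %gtid,
         //       i32 %num_teams_lo, i32 %num_teams_hi, i32 %thread_limit)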
6289   }
6290   // Generate the body of teams.
6291   InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
6292   InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
6293   BodyGenCB(AllocaIP, CodeGenIP);
6294 
6295   OutlineInfo OI;
6296   OI.EntryBB = AllocaBB;
6297   OI.ExitBB = ExitBB;
6298   OI.OuterAllocaBB = &OuterAllocaBB;
6299 
6300   // Insert fake values for global tid and bound tid.
6301   std::stack<Instruction *> ToBeDeleted;
6302   InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
6303   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6304       Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
6305   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6306       Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
6307 
6308   OI.PostOutlineCB = [this, Ident, ToBeDeleted](Function &OutlinedFn) mutable {
6309     // The stale call instruction will be replaced with a new call instruction
6310     // for the runtime call that takes the outlined function.
6311 
6312     assert(OutlinedFn.getNumUses() == 1 &&
6313            "there must be a single user for the outlined function");
6314     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
6315     ToBeDeleted.push(StaleCI);
6316 
6317     assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
6318            "Outlined function must have two or three arguments only");
6319 
6320     bool HasShared = OutlinedFn.arg_size() == 3;
6321 
6322     OutlinedFn.getArg(0)->setName("global.tid.ptr");
6323     OutlinedFn.getArg(1)->setName("bound.tid.ptr");
6324     if (HasShared)
6325       OutlinedFn.getArg(2)->setName("data");
6326 
6327     // Call to the runtime function for teams in the current function.
6328     assert(StaleCI && "Error while outlining - no CallInst user found for the "
6329                       "outlined function.");
6330     Builder.SetInsertPoint(StaleCI);
6331     SmallVector<Value *> Args = {
6332         Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
6333     if (HasShared)
6334       Args.push_back(StaleCI->getArgOperand(2));
6335     Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
6336                            omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
6337                        Args);
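         // The replacement call has the form (sketch):
         //   call void @__kmpc_fork_teams(ptr %ident, i32 <shared arg count>,
         //                                ptr @outlined_fn, ptr %data)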
6338 
6339     while (!ToBeDeleted.empty()) {
6340       ToBeDeleted.top()->eraseFromParent();
6341       ToBeDeleted.pop();
6342     }
6343   };
6344 
6345   addOutlineInfo(std::move(OI));
6346 
6347   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
6348 
6349   return Builder.saveIP();
6350 }
6351 
6352 GlobalVariable *
6353 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
6354                                        std::string VarName) {
6355   llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
6356       llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
6357                            Names.size()),
6358       Names);
6359   auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
6360       M, MapNamesArrayInit->getType(),
6361       /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
6362       VarName);
6363   return MapNamesArrayGlobal;
6364 }
6365 
6366 // Create all simple and struct types exposed by the runtime and remember
6367 // their llvm::PointerTypes for easy access later.
6368 void OpenMPIRBuilder::initializeTypes(Module &M) {
6369   LLVMContext &Ctx = M.getContext();
6370   StructType *T;
6371 #define OMP_TYPE(VarName, InitValue) VarName = InitValue;
6372 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
6373   VarName##Ty = ArrayType::get(ElemTy, ArraySize);                             \
6374   VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
6375 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
6376   VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);            \
6377   VarName##Ptr = PointerType::getUnqual(VarName);
6378 #define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...)                      \
6379   T = StructType::getTypeByName(Ctx, StructName);                              \
6380   if (!T)                                                                      \
6381     T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed);            \
6382   VarName = T;                                                                 \
6383   VarName##Ptr = PointerType::getUnqual(T);
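     // The #include below stamps out one instantiation of the matching macro
     // for every entry in OMPKinds.def. For illustration (a sketch), an
     // OMP_STRUCT_TYPE entry for ident_t expands roughly to:
     //   T = StructType::getTypeByName(Ctx, "struct.ident_t");
     //   if (!T)
     //     T = StructType::create(Ctx, {...}, "struct.ident_t", /*Packed=*/false);
     //   Ident = T;
     //   IdentPtr = PointerType::getUnqual(T);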
6384 #include "llvm/Frontend/OpenMP/OMPKinds.def"
6385 }
6386 
6387 void OpenMPIRBuilder::OutlineInfo::collectBlocks(
6388     SmallPtrSetImpl<BasicBlock *> &BlockSet,
6389     SmallVectorImpl<BasicBlock *> &BlockVector) {
6390   SmallVector<BasicBlock *, 32> Worklist;
6391   BlockSet.insert(EntryBB);
6392   BlockSet.insert(ExitBB);
6393 
6394   Worklist.push_back(EntryBB);
6395   while (!Worklist.empty()) {
6396     BasicBlock *BB = Worklist.pop_back_val();
6397     BlockVector.push_back(BB);
6398     for (BasicBlock *SuccBB : successors(BB))
6399       if (BlockSet.insert(SuccBB).second)
6400         Worklist.push_back(SuccBB);
6401   }
6402 }
6403 
6404 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
6405                                          uint64_t Size, int32_t Flags,
6406                                          GlobalValue::LinkageTypes,
6407                                          StringRef Name) {
6408   if (!Config.isGPU()) {
6409     llvm::offloading::emitOffloadingEntry(
6410         M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
6411         "omp_offloading_entries");
6412     return;
6413   }
6414   // TODO: Add support for global variables on the device after declare target
6415   // support.
6416   Function *Fn = dyn_cast<Function>(Addr);
6417   if (!Fn)
6418     return;
6419 
6420   Module &M = *(Fn->getParent());
6421   LLVMContext &Ctx = M.getContext();
6422 
6423   // Get "nvvm.annotations" metadata node.
6424   NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6425 
6426   Metadata *MDVals[] = {
6427       ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
6428       ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
6429   // Append metadata to nvvm.annotations.
6430   MD->addOperand(MDNode::get(Ctx, MDVals));
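       // The added entry has the shape (sketch):
       //   !nvvm.annotations = !{..., !{ptr @fn, !"kernel", i32 1}}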
6431 
6432   // Add a function attribute for the kernel.
6433   Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
6434   if (T.isAMDGCN())
6435     Fn->addFnAttr("uniform-work-group-size", "true");
6436   Fn->addFnAttr(Attribute::MustProgress);
6437 }
6438 
6439 // We only generate metadata for functions that contain target regions.
6440 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
6441     EmitMetadataErrorReportFunctionTy &ErrorFn) {
6442 
6443   // If there are no entries, we don't need to do anything.
6444   if (OffloadInfoManager.empty())
6445     return;
6446 
6447   LLVMContext &C = M.getContext();
6448   SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
6449                         TargetRegionEntryInfo>,
6450               16>
6451       OrderedEntries(OffloadInfoManager.size());
6452 
6453   // Auxiliary lambdas to create metadata values and strings.
6454   auto &&GetMDInt = [this](unsigned V) {
6455     return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
6456   };
6457 
6458   auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
6459 
6460   // Create the offloading info metadata node.
6461   NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
6462   auto &&TargetRegionMetadataEmitter =
6463       [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
6464           const TargetRegionEntryInfo &EntryInfo,
6465           const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
6466         // Generate metadata for target regions. Each entry of this metadata
6467         // contains:
6468         // - Entry 0 -> Kind of this type of metadata (0).
6469         // - Entry 1 -> Device ID of the file where the entry was identified.
6470         // - Entry 2 -> File ID of the file where the entry was identified.
6471         // - Entry 3 -> Mangled name of the function where the entry was
6472         // identified.
6473         // - Entry 4 -> Line in the file where the entry was identified.
6474         // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
6475         // - Entry 6 -> Order the entry was created.
6476         // The first element of the metadata node is the kind.
6477         Metadata *Ops[] = {
6478             GetMDInt(E.getKind()),      GetMDInt(EntryInfo.DeviceID),
6479             GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
6480             GetMDInt(EntryInfo.Line),   GetMDInt(EntryInfo.Count),
6481             GetMDInt(E.getOrder())};
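             // The resulting node has the shape (sketch):
             //   !{i32 0, i32 <device id>, i32 <file id>, !"<parent name>",
             //     i32 <line>, i32 <count>, i32 <order>}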
6482 
6483         // Save this entry in the right position of the ordered entries array.
6484         OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
6485 
6486         // Add metadata to the named metadata node.
6487         MD->addOperand(MDNode::get(C, Ops));
6488       };
6489 
6490   OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
6491 
6492   // Create a function that emits metadata for each device global variable entry.
6493   auto &&DeviceGlobalVarMetadataEmitter =
6494       [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
6495           StringRef MangledName,
6496           const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
6497         // Generate metadata for global variables. Each entry of this metadata
6498         // contains:
6499         // - Entry 0 -> Kind of this type of metadata (1).
6500         // - Entry 1 -> Mangled name of the variable.
6501         // - Entry 2 -> Declare target kind.
6502         // - Entry 3 -> Order the entry was created.
6503         // The first element of the metadata node is the kind.
6504         Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
6505                            GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
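             // The resulting node has the shape (sketch):
             //   !{i32 1, !"<mangled name>", i32 <declare target kind>,
             //     i32 <order>}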
6506 
6507         // Save this entry in the right position of the ordered entries array.
6508         TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
6509         OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
6510 
6511         // Add metadata to the named metadata node.
6512         MD->addOperand(MDNode::get(C, Ops));
6513       };
6514 
6515   OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
6516       DeviceGlobalVarMetadataEmitter);
6517 
6518   for (const auto &E : OrderedEntries) {
6519     assert(E.first && "All ordered entries must exist!");
6520     if (const auto *CE =
6521             dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
6522                 E.first)) {
6523       if (!CE->getID() || !CE->getAddress()) {
6524         // Do not blame the entry if the parent function is not emitted.
6525         TargetRegionEntryInfo EntryInfo = E.second;
6526         StringRef FnName = EntryInfo.ParentName;
6527         if (!M.getNamedValue(FnName))
6528           continue;
6529         ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
6530         continue;
6531       }
6532       createOffloadEntry(CE->getID(), CE->getAddress(),
6533                          /*Size=*/0, CE->getFlags(),
6534                          GlobalValue::WeakAnyLinkage);
6535     } else if (const auto *CE = dyn_cast<
6536                    OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
6537                    E.first)) {
6538       OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
6539           static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
6540               CE->getFlags());
6541       switch (Flags) {
6542       case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
6543       case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
6544         if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
6545           continue;
6546         if (!CE->getAddress()) {
6547           ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
6548           continue;
6549         }
6550         // The variable has no definition - no need to add the entry.
6551         if (CE->getVarSize() == 0)
6552           continue;
6553         break;
6554       case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
6555         assert(((Config.isTargetDevice() && !CE->getAddress()) ||
6556                 (!Config.isTargetDevice() && CE->getAddress())) &&
6557                "Declaret target link address is set.");
6558         if (Config.isTargetDevice())
6559           continue;
6560         if (!CE->getAddress()) {
6561           ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
6562           continue;
6563         }
6564         break;
6565       default:
6566         break;
6567       }
6568 
6569       // Hidden or internal symbols on the device are not externally visible.
6570       // We should not attempt to register them by creating an offloading
6571       // entry. Indirect variables are handled separately on the device.
6572       if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
6573         if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
6574             Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
6575           continue;
6576 
6577       // Indirect globals need to use a special name that doesn't match the name
6578       // of the associated host global.
6579       if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
6580         createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
6581                            Flags, CE->getLinkage(), CE->getVarName());
6582       else
6583         createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
6584                            Flags, CE->getLinkage());
6585 
6586     } else {
6587       llvm_unreachable("Unsupported entry kind.");
6588     }
6589   }
6590 }
6591 
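     // Composes the offload entry function name; e.g. a region in function
     // "foo" at line 42 yields "__omp_offloading_<device id>_<file id>_foo_l42",
     // plus a trailing "_<count>" when Count is non-zero.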
6592 void TargetRegionEntryInfo::getTargetRegionEntryFnName(
6593     SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
6594     unsigned FileID, unsigned Line, unsigned Count) {
6595   raw_svector_ostream OS(Name);
6596   OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
6597      << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
6598   if (Count)
6599     OS << "_" << Count;
6600 }
6601 
6602 void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
6603     SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
6604   unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
6605   TargetRegionEntryInfo::getTargetRegionEntryFnName(
6606       Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
6607       EntryInfo.Line, NewCount);
6608 }
6609 
6610 TargetRegionEntryInfo
6611 OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
6612                                           StringRef ParentName) {
6613   sys::fs::UniqueID ID;
6614   auto FileIDInfo = CallBack();
6615   if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
6616     report_fatal_error(("Unable to get unique ID for file, during "
6617                         "getTargetEntryUniqueInfo, error message: " +
6618                         EC.message())
6619                            .c_str());
6620   }
6621 
6622   return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
6623                                std::get<1>(FileIDInfo));
6624 }
6625 
6626 unsigned OpenMPIRBuilder::getFlagMemberOffset() {
6627   unsigned Offset = 0;
6628   for (uint64_t Remain =
6629            static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6630                omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
6631        !(Remain & 1); Remain = Remain >> 1)
6632     Offset++;
6633   return Offset;
6634 }
6635 
6636 omp::OpenMPOffloadMappingFlags
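     // Example, assuming the MEMBER_OF bits occupy the top 16 bits of the flag
     // word (OMP_MAP_MEMBER_OF == 0xffff000000000000): getFlagMemberOffset()
     // counts the trailing zero bits of that mask and returns 48, so
     // getMemberOfFlag(0) below yields 1 << 48, i.e. MEMBER_OF(1).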
6637 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
6638   // Rotate by getFlagMemberOffset() bits.
6639   return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
6640                                                      << getFlagMemberOffset());
6641 }
6642 
6643 void OpenMPIRBuilder::setCorrectMemberOfFlag(
6644     omp::OpenMPOffloadMappingFlags &Flags,
6645     omp::OpenMPOffloadMappingFlags MemberOfFlag) {
6646   // If the entry is PTR_AND_OBJ but has not been marked with the special
6647   // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
6648   // marked as MEMBER_OF.
6649   if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6650           Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
6651       static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6652           (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
6653           omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
6654     return;
6655 
6656   // Reset the placeholder value to prepare the flag for the assignment of the
6657   // proper MEMBER_OF value.
6658   Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
6659   Flags |= MemberOfFlag;
6660 }
6661 
6662 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
6663     OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
6664     OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
6665     bool IsDeclaration, bool IsExternallyVisible,
6666     TargetRegionEntryInfo EntryInfo, StringRef MangledName,
6667     std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
6668     std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
6669     std::function<Constant *()> GlobalInitializer,
6670     std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
6671   // TODO: convert this to utilise the IRBuilder Config rather than
6672   // a passed-down argument.
6673   if (OpenMPSIMD)
6674     return nullptr;
6675 
6676   if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
6677       ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
6678         CaptureClause ==
6679             OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
6680        Config.hasRequiresUnifiedSharedMemory())) {
6681     SmallString<64> PtrName;
6682     {
6683       raw_svector_ostream OS(PtrName);
6684       OS << MangledName;
6685       if (!IsExternallyVisible)
6686         OS << format("_%x", EntryInfo.FileID);
6687       OS << "_decl_tgt_ref_ptr";
6688     }
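         // The resulting name is e.g. "foo_decl_tgt_ref_ptr", or
         // "foo_<file id>_decl_tgt_ref_ptr" when the symbol is not externally
         // visible (a sketch; the exact string is built by the stream above).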
6689 
6690     Value *Ptr = M.getNamedValue(PtrName);
6691 
6692     if (!Ptr) {
6693       GlobalValue *GlobalValue = M.getNamedValue(MangledName);
6694       Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
6695 
6696       auto *GV = cast<GlobalVariable>(Ptr);
6697       GV->setLinkage(GlobalValue::WeakAnyLinkage);
6698 
6699       if (!Config.isTargetDevice()) {
6700         if (GlobalInitializer)
6701           GV->setInitializer(GlobalInitializer());
6702         else
6703           GV->setInitializer(GlobalValue);
6704       }
6705 
6706       registerTargetGlobalVariable(
6707           CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
6708           EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
6709           GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
6710     }
6711 
6712     return cast<Constant>(Ptr);
6713   }
6714 
6715   return nullptr;
6716 }
6717 
6718 void OpenMPIRBuilder::registerTargetGlobalVariable(
6719     OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
6720     OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
6721     bool IsDeclaration, bool IsExternallyVisible,
6722     TargetRegionEntryInfo EntryInfo, StringRef MangledName,
6723     std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
6724     std::vector<Triple> TargetTriple,
6725     std::function<Constant *()> GlobalInitializer,
6726     std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
6727     Constant *Addr) {
6728   if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
6729       (TargetTriple.empty() && !Config.isTargetDevice()))
6730     return;
6731 
6732   OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
6733   StringRef VarName;
6734   int64_t VarSize;
6735   GlobalValue::LinkageTypes Linkage;
6736 
6737   if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
6738        CaptureClause ==
6739            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
6740       !Config.hasRequiresUnifiedSharedMemory()) {
6741     Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
6742     VarName = MangledName;
6743     GlobalValue *LlvmVal = M.getNamedValue(VarName);
6744 
6745     if (!IsDeclaration)
6746       VarSize = divideCeil(
6747           M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
6748     else
6749       VarSize = 0;
6750     Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
6751 
6752     // This is a workaround carried over from Clang which prevents undesired
6753     // optimisation of internal variables.
6754     if (Config.isTargetDevice() &&
6755         (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
6756       // Do not create a "ref-variable" if the original is not also available
6757       // on the host.
6758       if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
6759         return;
6760 
6761       std::string RefName = createPlatformSpecificName({VarName, "ref"});
6762 
6763       if (!M.getNamedValue(RefName)) {
6764         Constant *AddrRef =
6765             getOrCreateInternalVariable(Addr->getType(), RefName);
6766         auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
6767         GvAddrRef->setConstant(true);
6768         GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
6769         GvAddrRef->setInitializer(Addr);
6770         GeneratedRefs.push_back(GvAddrRef);
6771       }
6772     }
6773   } else {
6774     if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
6775       Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
6776     else
6777       Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
6778 
6779     if (Config.isTargetDevice()) {
6780       VarName = (Addr) ? Addr->getName() : "";
6781       Addr = nullptr;
6782     } else {
6783       Addr = getAddrOfDeclareTargetVar(
6784           CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
6785           EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
6786           LlvmPtrTy, GlobalInitializer, VariableLinkage);
6787       VarName = (Addr) ? Addr->getName() : "";
6788     }
6789     VarSize = M.getDataLayout().getPointerSize();
6790     Linkage = GlobalValue::WeakAnyLinkage;
6791   }
6792 
6793   OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
6794                                                       Flags, Linkage);
6795 }
6796 
6797 /// Loads all the offload entry information from the host IR
6798 /// metadata.
6799 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
6800   // If we are in target mode, load the metadata from the host IR. This code has
6801   // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
6802 
6803   NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
6804   if (!MD)
6805     return;
6806 
6807   for (MDNode *MN : MD->operands()) {
6808     auto &&GetMDInt = [MN](unsigned Idx) {
6809       auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
6810       return cast<ConstantInt>(V->getValue())->getZExtValue();
6811     };
6812 
6813     auto &&GetMDString = [MN](unsigned Idx) {
6814       auto *V = cast<MDString>(MN->getOperand(Idx));
6815       return V->getString();
6816     };
6817 
6818     switch (GetMDInt(0)) {
6819     default:
6820       llvm_unreachable("Unexpected metadata!");
6821       break;
6822     case OffloadEntriesInfoManager::OffloadEntryInfo::
6823         OffloadingEntryInfoTargetRegion: {
6824       TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
6825                                       /*DeviceID=*/GetMDInt(1),
6826                                       /*FileID=*/GetMDInt(2),
6827                                       /*Line=*/GetMDInt(4),
6828                                       /*Count=*/GetMDInt(5));
6829       OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
6830                                                          /*Order=*/GetMDInt(6));
6831       break;
6832     }
6833     case OffloadEntriesInfoManager::OffloadEntryInfo::
6834         OffloadingEntryInfoDeviceGlobalVar:
6835       OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
6836           /*MangledName=*/GetMDString(1),
6837           static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
6838               /*Flags=*/GetMDInt(2)),
6839           /*Order=*/GetMDInt(3));
6840       break;
6841     }
6842   }
6843 }
6844 
6845 void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
6846   if (HostFilePath.empty())
6847     return;
6848 
6849   auto Buf = MemoryBuffer::getFile(HostFilePath);
6850   if (std::error_code Err = Buf.getError()) {
6851     report_fatal_error(("error opening host file from host file path inside of "
6852                         "OpenMPIRBuilder: " +
6853                         Err.message())
6854                            .c_str());
6855   }
6856 
6857   LLVMContext Ctx;
6858   auto M = expectedToErrorOrAndEmitErrors(
6859       Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
6860   if (std::error_code Err = M.getError()) {
6861     report_fatal_error(
6862         ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
6863             .c_str());
6864   }
6865 
6866   loadOffloadInfoMetadata(*M.get());
6867 }
6868 
6869 Function *OpenMPIRBuilder::createRegisterRequires(StringRef Name) {
6870   // Skip the creation of the registration function if this is device codegen
6871   if (Config.isTargetDevice())
6872     return nullptr;
6873 
6874   Builder.ClearInsertionPoint();
6875 
6876   // Create registration function prototype
6877   auto *RegFnTy = FunctionType::get(Builder.getVoidTy(), {});
6878   auto *RegFn = Function::Create(
6879       RegFnTy, GlobalVariable::LinkageTypes::InternalLinkage, Name, M);
6880   RegFn->setSection(".text.startup");
6881   RegFn->addFnAttr(Attribute::NoInline);
6882   RegFn->addFnAttr(Attribute::NoUnwind);
6883 
6884   // Create registration function body
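       // The emitted function will look like (sketch):
       //   define internal void @<Name>() section ".text.startup" {
       //   entry:
       //     call void @__tgt_register_requires(i64 <requires flags>)
       //     ret void
       //   }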
6885   auto *BB = BasicBlock::Create(M.getContext(), "entry", RegFn);
6886   ConstantInt *FlagsVal =
6887       ConstantInt::getSigned(Builder.getInt64Ty(), Config.getRequiresFlags());
6888   Function *RTLRegFn = getOrCreateRuntimeFunctionPtr(
6889       omp::RuntimeFunction::OMPRTL___tgt_register_requires);
6890 
6891   Builder.SetInsertPoint(BB);
6892   Builder.CreateCall(RTLRegFn, {FlagsVal});
6893   Builder.CreateRetVoid();
6894 
6895   return RegFn;
6896 }
6897 
6898 //===----------------------------------------------------------------------===//
6899 // OffloadEntriesInfoManager
6900 //===----------------------------------------------------------------------===//
6901 
6902 bool OffloadEntriesInfoManager::empty() const {
6903   return OffloadEntriesTargetRegion.empty() &&
6904          OffloadEntriesDeviceGlobalVar.empty();
6905 }
6906 
6907 unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
6908     const TargetRegionEntryInfo &EntryInfo) const {
6909   auto It = OffloadEntriesTargetRegionCount.find(
6910       getTargetRegionEntryCountKey(EntryInfo));
6911   if (It == OffloadEntriesTargetRegionCount.end())
6912     return 0;
6913   return It->second;
6914 }
6915 
6916 void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
6917     const TargetRegionEntryInfo &EntryInfo) {
6918   OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
6919       EntryInfo.Count + 1;
6920 }
6921 
6922 /// Initialize target region entry.
6923 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
6924     const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
6925   OffloadEntriesTargetRegion[EntryInfo] =
6926       OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
6927                                    OMPTargetRegionEntryTargetRegion);
6928   ++OffloadingEntriesNum;
6929 }
6930 
6931 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
6932     TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
6933     OMPTargetRegionEntryKind Flags) {
6934   assert(EntryInfo.Count == 0 && "expected default EntryInfo");
6935 
6936   // Update the EntryInfo with the next available count for this location.
6937   EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
6938 
6939   // If we are emitting code for a target device, the entry is already
6940   // initialized; it only has to be registered.
6941   if (OMPBuilder->Config.isTargetDevice()) {
6942     // This could happen if the device compilation is invoked standalone.
6943     if (!hasTargetRegionEntryInfo(EntryInfo)) {
6944       return;
6945     }
6946     auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
6947     Entry.setAddress(Addr);
6948     Entry.setID(ID);
6949     Entry.setFlags(Flags);
6950   } else {
6951     if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
6952         hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
6953       return;
6954     assert(!hasTargetRegionEntryInfo(EntryInfo) &&
6955            "Target region entry already registered!");
6956     OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
6957     OffloadEntriesTargetRegion[EntryInfo] = Entry;
6958     ++OffloadingEntriesNum;
6959   }
6960   incrementTargetRegionEntryInfoCount(EntryInfo);
6961 }
6962 
6963 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
6964     TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
6965 
6966   // Update the EntryInfo with the next available count for this location.
6967   EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
6968 
6969   auto It = OffloadEntriesTargetRegion.find(EntryInfo);
6970   if (It == OffloadEntriesTargetRegion.end()) {
6971     return false;
6972   }
6973   // Fail if this entry is already registered.
6974   if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
6975     return false;
6976   return true;
6977 }
6978 
6979 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
6980     const OffloadTargetRegionEntryInfoActTy &Action) {
6981   // Scan all target region entries and perform the provided action.
6982   for (const auto &It : OffloadEntriesTargetRegion) {
6983     Action(It.first, It.second);
6984   }
6985 }
6986 
6987 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
6988     StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
6989   OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
6990   ++OffloadingEntriesNum;
6991 }
6992 
6993 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
6994     StringRef VarName, Constant *Addr, int64_t VarSize,
6995     OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
6996   if (OMPBuilder->Config.isTargetDevice()) {
6997     // This could happen if the device compilation is invoked standalone.
6998     if (!hasDeviceGlobalVarEntryInfo(VarName))
6999       return;
7000     auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
7001     if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
7002       if (Entry.getVarSize() == 0) {
7003         Entry.setVarSize(VarSize);
7004         Entry.setLinkage(Linkage);
7005       }
7006       return;
7007     }
7008     Entry.setVarSize(VarSize);
7009     Entry.setLinkage(Linkage);
7010     Entry.setAddress(Addr);
7011   } else {
7012     if (hasDeviceGlobalVarEntryInfo(VarName)) {
7013       auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
7014       assert(Entry.isValid() && Entry.getFlags() == Flags &&
7015              "Entry not initialized!");
7016       if (Entry.getVarSize() == 0) {
7017         Entry.setVarSize(VarSize);
7018         Entry.setLinkage(Linkage);
7019       }
7020       return;
7021     }
7022     if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
7023       OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
7024                                                 Addr, VarSize, Flags, Linkage,
7025                                                 VarName.str());
7026     else
7027       OffloadEntriesDeviceGlobalVar.try_emplace(
7028           VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
7029     ++OffloadingEntriesNum;
7030   }
7031 }
7032 
7033 void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
7034     const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
7035   // Scan all device global variable entries and perform the provided action.
7036   for (const auto &E : OffloadEntriesDeviceGlobalVar)
7037     Action(E.getKey(), E.getValue());
7038 }
7039 
7040 //===----------------------------------------------------------------------===//
7041 // CanonicalLoopInfo
7042 //===----------------------------------------------------------------------===//
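     //
     // Shape of a canonical loop maintained by this class (sketch):
     //   preheader -> header -> cond -> body -> ... -> latch -> header (backedge)
     //                           `--> exit -> after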
7043 
7044 void CanonicalLoopInfo::collectControlBlocks(
7045     SmallVectorImpl<BasicBlock *> &BBs) {
7046   // We only count those BBs as control blocks for which we do not need to
7047   // reverse the CFG, i.e. not the loop body which can contain arbitrary control
7048   // flow. For consistency, this also means we do not add the Body block, which
7049   // is just the entry to the body code.
7050   BBs.reserve(BBs.size() + 6);
7051   BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
7052 }
7053 
7054 BasicBlock *CanonicalLoopInfo::getPreheader() const {
7055   assert(isValid() && "Requires a valid canonical loop");
7056   for (BasicBlock *Pred : predecessors(Header)) {
7057     if (Pred != Latch)
7058       return Pred;
7059   }
7060   llvm_unreachable("Missing preheader");
7061 }
7062 
7063 void CanonicalLoopInfo::setTripCount(Value *TripCount) {
7064   assert(isValid() && "Requires a valid canonical loop");
7065 
7066   Instruction *CmpI = &getCond()->front();
7067   assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
7068   CmpI->setOperand(1, TripCount);
7069 
7070 #ifndef NDEBUG
7071   assertOK();
7072 #endif
7073 }
7074 
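     // Typical use (a sketch; `Offset` stands for a hypothetical extra value):
     //   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
     //     Builder.SetInsertPoint(CLI->getBody()->getFirstNonPHI());
     //     return Builder.CreateAdd(OldIV, Offset, "iv.shifted");
     //   });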
7075 void CanonicalLoopInfo::mapIndVar(
7076     llvm::function_ref<Value *(Instruction *)> Updater) {
7077   assert(isValid() && "Requires a valid canonical loop");
7078 
7079   Instruction *OldIV = getIndVar();
7080 
7081   // Record all uses excluding those introduced by the updater. Uses by the
7082   // CanonicalLoopInfo itself to keep track of the number of iterations are
7083   // excluded.
7084   SmallVector<Use *> ReplaceableUses;
7085   for (Use &U : OldIV->uses()) {
7086     auto *User = dyn_cast<Instruction>(U.getUser());
7087     if (!User)
7088       continue;
7089     if (User->getParent() == getCond())
7090       continue;
7091     if (User->getParent() == getLatch())
7092       continue;
7093     ReplaceableUses.push_back(&U);
7094   }
7095 
7096   // Run the updater that may introduce new uses
7097   Value *NewIV = Updater(OldIV);
7098 
7099   // Replace the old uses with the value returned by the updater.
7100   for (Use *U : ReplaceableUses)
7101     U->set(NewIV);
7102 
7103 #ifndef NDEBUG
7104   assertOK();
7105 #endif
7106 }
7107 
7108 void CanonicalLoopInfo::assertOK() const {
7109 #ifndef NDEBUG
7110   // No constraints if this object currently does not describe a loop.
7111   if (!isValid())
7112     return;
7113 
7114   BasicBlock *Preheader = getPreheader();
7115   BasicBlock *Body = getBody();
7116   BasicBlock *After = getAfter();
7117 
7118   // Verify the standard control flow we use for OpenMP loops.
7119   assert(Preheader);
7120   assert(isa<BranchInst>(Preheader->getTerminator()) &&
7121          "Preheader must terminate with unconditional branch");
7122   assert(Preheader->getSingleSuccessor() == Header &&
7123          "Preheader must jump to header");
7124 
7125   assert(Header);
7126   assert(isa<BranchInst>(Header->getTerminator()) &&
7127          "Header must terminate with unconditional branch");
7128   assert(Header->getSingleSuccessor() == Cond &&
7129          "Header must jump to exiting block");
7130 
7131   assert(Cond);
7132   assert(Cond->getSinglePredecessor() == Header &&
7133          "Exiting block only reachable from header");
7134 
7135   assert(isa<BranchInst>(Cond->getTerminator()) &&
7136          "Exiting block must terminate with conditional branch");
7137   assert(size(successors(Cond)) == 2 &&
7138          "Exiting block must have two successors");
7139   assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
7140          "Exiting block's first successor jump to the body");
7141   assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
7142          "Exiting block's second successor must exit the loop");
7143 
7144   assert(Body);
7145   assert(Body->getSinglePredecessor() == Cond &&
7146          "Body only reachable from exiting block");
7147   assert(!isa<PHINode>(Body->front()));
7148 
7149   assert(Latch);
7150   assert(isa<BranchInst>(Latch->getTerminator()) &&
7151          "Latch must terminate with unconditional branch");
7152   assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
7153   // TODO: To support body code that ends in multiple blocks (a latch with
7154   // multiple predecessors), introduce an auxiliary block like preheader/after.
7155   assert(Latch->getSinglePredecessor() != nullptr);
7156   assert(!isa<PHINode>(Latch->front()));
7157 
7158   assert(Exit);
7159   assert(isa<BranchInst>(Exit->getTerminator()) &&
7160          "Exit block must terminate with unconditional branch");
7161   assert(Exit->getSingleSuccessor() == After &&
7162          "Exit block must jump to after block");
7163 
7164   assert(After);
7165   assert(After->getSinglePredecessor() == Exit &&
7166          "After block only reachable from exit block");
7167   assert(After->empty() || !isa<PHINode>(After->front()));
7168 
7169   Instruction *IndVar = getIndVar();
7170   assert(IndVar && "Canonical induction variable not found?");
7171   assert(isa<IntegerType>(IndVar->getType()) &&
7172          "Induction variable must be an integer");
7173   assert(cast<PHINode>(IndVar)->getParent() == Header &&
7174          "Induction variable must be a PHI in the loop header");
7175   assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
7176   assert(
7177       cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
7178   assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
7179 
7180   auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
7181   assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
7182   assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
7183   assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
7184   assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
7185              ->isOne());
7186 
7187   Value *TripCount = getTripCount();
7188   assert(TripCount && "Loop trip count not found?");
7189   assert(IndVar->getType() == TripCount->getType() &&
7190          "Trip count and induction variable must have the same type");
7191 
7192   auto *CmpI = cast<CmpInst>(&Cond->front());
7193   assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
7194          "Exit condition must be a signed less-than comparison");
7195   assert(CmpI->getOperand(0) == IndVar &&
7196          "Exit condition must compare the induction variable");
7197   assert(CmpI->getOperand(1) == TripCount &&
7198          "Exit condition must compare with the trip count");
7199 #endif
7200 }
7201 
7202 void CanonicalLoopInfo::invalidate() {
7203   Header = nullptr;
7204   Cond = nullptr;
7205   Latch = nullptr;
7206   Exit = nullptr;
7207 }
7208