1 //===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the OpenMPIRBuilder class and helpers used as a convenient 10 // way to create LLVM instructions for OpenMP directives. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H 15 #define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H 16 17 #include "llvm/Frontend/Atomic/Atomic.h" 18 #include "llvm/Frontend/OpenMP/OMPConstants.h" 19 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 20 #include "llvm/IR/DebugLoc.h" 21 #include "llvm/IR/IRBuilder.h" 22 #include "llvm/IR/Module.h" 23 #include "llvm/IR/ValueMap.h" 24 #include "llvm/Support/Allocator.h" 25 #include "llvm/Support/Compiler.h" 26 #include "llvm/Support/Error.h" 27 #include "llvm/TargetParser/Triple.h" 28 #include <forward_list> 29 #include <map> 30 #include <optional> 31 32 namespace llvm { 33 class CanonicalLoopInfo; 34 struct TargetRegionEntryInfo; 35 class OffloadEntriesInfoManager; 36 class OpenMPIRBuilder; 37 class Loop; 38 class LoopAnalysis; 39 class LoopInfo; 40 41 /// Move the instruction after an InsertPoint to the beginning of another 42 /// BasicBlock. 43 /// 44 /// The instructions after \p IP are moved to the beginning of \p New which must 45 /// not have any PHINodes. If \p CreateBranch is true, a branch instruction to 46 /// \p New will be added such that there is no semantic change. Otherwise, the 47 /// \p IP insert block remains degenerate and it is up to the caller to insert a 48 /// terminator. \p DL is used as the debug location for the branch instruction 49 /// if one is created. 
50 LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, 51 bool CreateBranch, DebugLoc DL); 52 53 /// Splice a BasicBlock at an IRBuilder's current insertion point. Its new 54 /// insert location will stick to after the instruction before the insertion 55 /// point (instead of moving with the instruction the InsertPoint stores 56 /// internally). 57 LLVM_ABI void spliceBB(IRBuilder<> &Builder, BasicBlock *New, 58 bool CreateBranch); 59 60 /// Split a BasicBlock at an InsertPoint, even if the block is degenerate 61 /// (missing the terminator). 62 /// 63 /// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed 64 /// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch 65 /// is true, a branch to the new successor will new created such that 66 /// semantically there is no change; otherwise the block of the insertion point 67 /// remains degenerate and it is the caller's responsibility to insert a 68 /// terminator. \p DL is used as the debug location for the branch instruction 69 /// if one is created. Returns the new successor block. 70 LLVM_ABI BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, 71 DebugLoc DL, llvm::Twine Name = {}); 72 73 /// Split a BasicBlock at \p Builder's insertion point, even if the block is 74 /// degenerate (missing the terminator). Its new insert location will stick to 75 /// after the instruction before the insertion point (instead of moving with the 76 /// instruction the InsertPoint stores internally). 77 LLVM_ABI BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch, 78 llvm::Twine Name = {}); 79 80 /// Split a BasicBlock at \p Builder's insertion point, even if the block is 81 /// degenerate (missing the terminator). Its new insert location will stick to 82 /// after the instruction before the insertion point (instead of moving with the 83 /// instruction the InsertPoint stores internally). 
84 LLVM_ABI BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, 85 llvm::Twine Name); 86 87 /// Like splitBB, but reuses the current block's name for the new name. 88 LLVM_ABI BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, 89 bool CreateBranch, 90 llvm::Twine Suffix = ".split"); 91 92 /// Captures attributes that affect generating LLVM-IR using the 93 /// OpenMPIRBuilder and related classes. Note that not all attributes are 94 /// required for all classes or functions. In some use cases the configuration 95 /// is not necessary at all, because because the only functions that are called 96 /// are ones that are not dependent on the configuration. 97 class OpenMPIRBuilderConfig { 98 public: 99 /// Flag to define whether to generate code for the role of the OpenMP host 100 /// (if set to false) or device (if set to true) in an offloading context. It 101 /// is set when the -fopenmp-is-target-device compiler frontend option is 102 /// specified. 103 std::optional<bool> IsTargetDevice; 104 105 /// Flag for specifying if the compilation is done for an accelerator. It is 106 /// set according to the architecture of the target triple and currently only 107 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform 108 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true 109 /// if `IsGPU` is true. This restriction might be lifted if an accelerator- 110 /// like target with the ability to work as the OpenMP host is added, or if 111 /// the capabilities of the currently supported GPU architectures are 112 /// expanded. 113 std::optional<bool> IsGPU; 114 115 /// Flag for specifying if LLVMUsed information should be emitted. 116 std::optional<bool> EmitLLVMUsedMetaInfo; 117 118 /// Flag for specifying if offloading is mandatory. 119 std::optional<bool> OpenMPOffloadMandatory; 120 121 /// First separator used between the initial two parts of a name. 
122 std::optional<StringRef> FirstSeparator; 123 /// Separator used between all of the rest consecutive parts of s name 124 std::optional<StringRef> Separator; 125 126 // Grid Value for the GPU target 127 std::optional<omp::GV> GridValue; 128 129 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice = 130 /// false`), this contains the list of offloading triples associated, if any. 131 SmallVector<Triple> TargetTriples; 132 133 LLVM_ABI OpenMPIRBuilderConfig(); 134 LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU, 135 bool OpenMPOffloadMandatory, 136 bool HasRequiresReverseOffload, 137 bool HasRequiresUnifiedAddress, 138 bool HasRequiresUnifiedSharedMemory, 139 bool HasRequiresDynamicAllocators); 140 141 // Getters functions that assert if the required values are not present. isTargetDevice()142 bool isTargetDevice() const { 143 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set"); 144 return *IsTargetDevice; 145 } 146 isGPU()147 bool isGPU() const { 148 assert(IsGPU.has_value() && "IsGPU is not set"); 149 return *IsGPU; 150 } 151 openMPOffloadMandatory()152 bool openMPOffloadMandatory() const { 153 assert(OpenMPOffloadMandatory.has_value() && 154 "OpenMPOffloadMandatory is not set"); 155 return *OpenMPOffloadMandatory; 156 } 157 getGridValue()158 omp::GV getGridValue() const { 159 assert(GridValue.has_value() && "GridValue is not set"); 160 return *GridValue; 161 } 162 hasRequiresFlags()163 bool hasRequiresFlags() const { return RequiresFlags; } 164 LLVM_ABI bool hasRequiresReverseOffload() const; 165 LLVM_ABI bool hasRequiresUnifiedAddress() const; 166 LLVM_ABI bool hasRequiresUnifiedSharedMemory() const; 167 LLVM_ABI bool hasRequiresDynamicAllocators() const; 168 169 /// Returns requires directive clauses as flags compatible with those expected 170 /// by libomptarget. 
171 LLVM_ABI int64_t getRequiresFlags() const; 172 173 // Returns the FirstSeparator if set, otherwise use the default separator 174 // depending on isGPU firstSeparator()175 StringRef firstSeparator() const { 176 if (FirstSeparator.has_value()) 177 return *FirstSeparator; 178 if (isGPU()) 179 return "_"; 180 return "."; 181 } 182 183 // Returns the Separator if set, otherwise use the default separator depending 184 // on isGPU separator()185 StringRef separator() const { 186 if (Separator.has_value()) 187 return *Separator; 188 if (isGPU()) 189 return "$"; 190 return "."; 191 } 192 setIsTargetDevice(bool Value)193 void setIsTargetDevice(bool Value) { IsTargetDevice = Value; } setIsGPU(bool Value)194 void setIsGPU(bool Value) { IsGPU = Value; } 195 void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; } setOpenMPOffloadMandatory(bool Value)196 void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; } setFirstSeparator(StringRef FS)197 void setFirstSeparator(StringRef FS) { FirstSeparator = FS; } setSeparator(StringRef S)198 void setSeparator(StringRef S) { Separator = S; } setGridValue(omp::GV G)199 void setGridValue(omp::GV G) { GridValue = G; } 200 201 LLVM_ABI void setHasRequiresReverseOffload(bool Value); 202 LLVM_ABI void setHasRequiresUnifiedAddress(bool Value); 203 LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value); 204 LLVM_ABI void setHasRequiresDynamicAllocators(bool Value); 205 206 private: 207 /// Flags for specifying which requires directive clauses are present. 208 int64_t RequiresFlags; 209 }; 210 211 /// Data structure to contain the information needed to uniquely identify 212 /// a target entry. 213 struct TargetRegionEntryInfo { 214 /// The prefix used for kernel names. 
215 static constexpr const char *KernelNamePrefix = "__omp_offloading_"; 216 217 std::string ParentName; 218 unsigned DeviceID; 219 unsigned FileID; 220 unsigned Line; 221 unsigned Count; 222 TargetRegionEntryInfoTargetRegionEntryInfo223 TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {} 224 TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID, 225 unsigned FileID, unsigned Line, unsigned Count = 0) ParentNameTargetRegionEntryInfo226 : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line), 227 Count(Count) {} 228 229 LLVM_ABI static void 230 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, StringRef ParentName, 231 unsigned DeviceID, unsigned FileID, unsigned Line, 232 unsigned Count); 233 234 bool operator<(const TargetRegionEntryInfo &RHS) const { 235 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) < 236 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line, 237 RHS.Count); 238 } 239 }; 240 241 /// Class that manages information about offload code regions and data 242 class OffloadEntriesInfoManager { 243 /// Number of entries registered so far. 244 OpenMPIRBuilder *OMPBuilder; 245 unsigned OffloadingEntriesNum = 0; 246 247 public: 248 /// Base class of the entries info. 249 class OffloadEntryInfo { 250 public: 251 /// Kind of a given entry. 252 enum OffloadingEntryInfoKinds : unsigned { 253 /// Entry is a target region. 254 OffloadingEntryInfoTargetRegion = 0, 255 /// Entry is a declare target variable. 256 OffloadingEntryInfoDeviceGlobalVar = 1, 257 /// Invalid entry info. 
258 OffloadingEntryInfoInvalid = ~0u 259 }; 260 261 protected: 262 OffloadEntryInfo() = delete; OffloadEntryInfo(OffloadingEntryInfoKinds Kind)263 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {} OffloadEntryInfo(OffloadingEntryInfoKinds Kind,unsigned Order,uint32_t Flags)264 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, 265 uint32_t Flags) 266 : Flags(Flags), Order(Order), Kind(Kind) {} 267 ~OffloadEntryInfo() = default; 268 269 public: isValid()270 bool isValid() const { return Order != ~0u; } getOrder()271 unsigned getOrder() const { return Order; } getKind()272 OffloadingEntryInfoKinds getKind() const { return Kind; } getFlags()273 uint32_t getFlags() const { return Flags; } setFlags(uint32_t NewFlags)274 void setFlags(uint32_t NewFlags) { Flags = NewFlags; } getAddress()275 Constant *getAddress() const { return cast_or_null<Constant>(Addr); } setAddress(Constant * V)276 void setAddress(Constant *V) { 277 assert(!Addr.pointsToAliveValue() && "Address has been set before!"); 278 Addr = V; 279 } classof(const OffloadEntryInfo * Info)280 static bool classof(const OffloadEntryInfo *Info) { return true; } 281 282 private: 283 /// Address of the entity that has to be mapped for offloading. 284 WeakTrackingVH Addr; 285 286 /// Flags associated with the device global. 287 uint32_t Flags = 0u; 288 289 /// Order this entry was emitted. 290 unsigned Order = ~0u; 291 292 OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid; 293 }; 294 295 /// Return true if a there are no entries defined. 296 LLVM_ABI bool empty() const; 297 /// Return number of entries defined so far. size()298 unsigned size() const { return OffloadingEntriesNum; } 299 OffloadEntriesInfoManager(OpenMPIRBuilder * builder)300 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {} 301 302 // 303 // Target region entries related. 304 // 305 306 /// Kind of the target registry entry. 
307 enum OMPTargetRegionEntryKind : uint32_t { 308 /// Mark the entry as target region. 309 OMPTargetRegionEntryTargetRegion = 0x0, 310 }; 311 312 /// Target region entries info. 313 class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo { 314 /// Address that can be used as the ID of the entry. 315 Constant *ID = nullptr; 316 317 public: OffloadEntryInfoTargetRegion()318 OffloadEntryInfoTargetRegion() 319 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {} OffloadEntryInfoTargetRegion(unsigned Order,Constant * Addr,Constant * ID,OMPTargetRegionEntryKind Flags)320 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr, 321 Constant *ID, 322 OMPTargetRegionEntryKind Flags) 323 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags), 324 ID(ID) { 325 setAddress(Addr); 326 } 327 getID()328 Constant *getID() const { return ID; } setID(Constant * V)329 void setID(Constant *V) { 330 assert(!ID && "ID has been set before!"); 331 ID = V; 332 } classof(const OffloadEntryInfo * Info)333 static bool classof(const OffloadEntryInfo *Info) { 334 return Info->getKind() == OffloadingEntryInfoTargetRegion; 335 } 336 }; 337 338 /// Initialize target region entry. 339 /// This is ONLY needed for DEVICE compilation. 340 LLVM_ABI void 341 initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, 342 unsigned Order); 343 /// Register target region entry. 344 LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, 345 Constant *Addr, Constant *ID, 346 OMPTargetRegionEntryKind Flags); 347 /// Return true if a target region entry with the provided information 348 /// exists. 349 LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, 350 bool IgnoreAddressId = false) const; 351 352 // Return the Name based on \a EntryInfo using the next available Count. 
353 LLVM_ABI void 354 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, 355 const TargetRegionEntryInfo &EntryInfo); 356 357 /// brief Applies action \a Action on all registered entries. 358 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo, 359 const OffloadEntryInfoTargetRegion &)> 360 OffloadTargetRegionEntryInfoActTy; 361 LLVM_ABI void 362 actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action); 363 364 // 365 // Device global variable entries related. 366 // 367 368 /// Kind of the global variable entry.. 369 enum OMPTargetGlobalVarEntryKind : uint32_t { 370 /// Mark the entry as a to declare target. 371 OMPTargetGlobalVarEntryTo = 0x0, 372 /// Mark the entry as a to declare target link. 373 OMPTargetGlobalVarEntryLink = 0x1, 374 /// Mark the entry as a declare target enter. 375 OMPTargetGlobalVarEntryEnter = 0x2, 376 /// Mark the entry as having no declare target entry kind. 377 OMPTargetGlobalVarEntryNone = 0x3, 378 /// Mark the entry as a declare target indirect global. 379 OMPTargetGlobalVarEntryIndirect = 0x8, 380 /// Mark the entry as a register requires global. 381 OMPTargetGlobalRegisterRequires = 0x10, 382 }; 383 384 /// Kind of device clause for declare target variables 385 /// and functions 386 /// NOTE: Currently not used as a part of a variable entry 387 /// used for Flang and Clang to interface with the variable 388 /// related registration functions 389 enum OMPTargetDeviceClauseKind : uint32_t { 390 /// The target is marked for all devices 391 OMPTargetDeviceClauseAny = 0x0, 392 /// The target is marked for non-host devices 393 OMPTargetDeviceClauseNoHost = 0x1, 394 /// The target is marked for host devices 395 OMPTargetDeviceClauseHost = 0x2, 396 /// The target is marked as having no clause 397 OMPTargetDeviceClauseNone = 0x3 398 }; 399 400 /// Device global variable entries info. 401 class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo { 402 /// Type of the global variable. 
403 int64_t VarSize; 404 GlobalValue::LinkageTypes Linkage; 405 const std::string VarName; 406 407 public: OffloadEntryInfoDeviceGlobalVar()408 OffloadEntryInfoDeviceGlobalVar() 409 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {} OffloadEntryInfoDeviceGlobalVar(unsigned Order,OMPTargetGlobalVarEntryKind Flags)410 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, 411 OMPTargetGlobalVarEntryKind Flags) 412 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {} OffloadEntryInfoDeviceGlobalVar(unsigned Order,Constant * Addr,int64_t VarSize,OMPTargetGlobalVarEntryKind Flags,GlobalValue::LinkageTypes Linkage,const std::string & VarName)413 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr, 414 int64_t VarSize, 415 OMPTargetGlobalVarEntryKind Flags, 416 GlobalValue::LinkageTypes Linkage, 417 const std::string &VarName) 418 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags), 419 VarSize(VarSize), Linkage(Linkage), VarName(VarName) { 420 setAddress(Addr); 421 } 422 getVarSize()423 int64_t getVarSize() const { return VarSize; } getVarName()424 StringRef getVarName() const { return VarName; } setVarSize(int64_t Size)425 void setVarSize(int64_t Size) { VarSize = Size; } getLinkage()426 GlobalValue::LinkageTypes getLinkage() const { return Linkage; } setLinkage(GlobalValue::LinkageTypes LT)427 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; } classof(const OffloadEntryInfo * Info)428 static bool classof(const OffloadEntryInfo *Info) { 429 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar; 430 } 431 }; 432 433 /// Initialize device global variable entry. 434 /// This is ONLY used for DEVICE compilation. 435 LLVM_ABI void initializeDeviceGlobalVarEntryInfo( 436 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order); 437 438 /// Register device global variable entry. 
439 LLVM_ABI void registerDeviceGlobalVarEntryInfo( 440 StringRef VarName, Constant *Addr, int64_t VarSize, 441 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage); 442 /// Checks if the variable with the given name has been registered already. hasDeviceGlobalVarEntryInfo(StringRef VarName)443 bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const { 444 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0; 445 } 446 /// Applies action \a Action on all registered entries. 447 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> 448 OffloadDeviceGlobalVarEntryInfoActTy; 449 LLVM_ABI void actOnDeviceGlobalVarEntriesInfo( 450 const OffloadDeviceGlobalVarEntryInfoActTy &Action); 451 452 private: 453 /// Return the count of entries at a particular source location. 454 unsigned 455 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const; 456 457 /// Update the count of entries at a particular source location. 458 void 459 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo); 460 461 static TargetRegionEntryInfo getTargetRegionEntryCountKey(const TargetRegionEntryInfo & EntryInfo)462 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) { 463 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID, 464 EntryInfo.FileID, EntryInfo.Line, 0); 465 } 466 467 // Count of entries at a location. 468 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount; 469 470 // Storage for target region entries kind. 471 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion> 472 OffloadEntriesTargetRegionTy; 473 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion; 474 /// Storage for device global variable entries kind. The storage is to be 475 /// indexed by mangled name. 
476 typedef StringMap<OffloadEntryInfoDeviceGlobalVar> 477 OffloadEntriesDeviceGlobalVarTy; 478 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar; 479 }; 480 481 /// An interface to create LLVM-IR for OpenMP directives. 482 /// 483 /// Each OpenMP directive has a corresponding public generator method. 484 class OpenMPIRBuilder { 485 public: 486 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will 487 /// not have an effect on \p M (see initialize) OpenMPIRBuilder(Module & M)488 OpenMPIRBuilder(Module &M) 489 : M(M), Builder(M.getContext()), OffloadInfoManager(this), 490 T(M.getTargetTriple()), IsFinalized(false) {} 491 LLVM_ABI ~OpenMPIRBuilder(); 492 493 class AtomicInfo : public llvm::AtomicInfo { 494 llvm::Value *AtomicVar; 495 496 public: AtomicInfo(IRBuilder<> * Builder,llvm::Type * Ty,uint64_t AtomicSizeInBits,uint64_t ValueSizeInBits,llvm::Align AtomicAlign,llvm::Align ValueAlign,bool UseLibcall,IRBuilderBase::InsertPoint AllocaIP,llvm::Value * AtomicVar)497 AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits, 498 uint64_t ValueSizeInBits, llvm::Align AtomicAlign, 499 llvm::Align ValueAlign, bool UseLibcall, 500 IRBuilderBase::InsertPoint AllocaIP, llvm::Value *AtomicVar) 501 : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits, 502 AtomicAlign, ValueAlign, UseLibcall, AllocaIP), 503 AtomicVar(AtomicVar) {} 504 getAtomicPointer()505 llvm::Value *getAtomicPointer() const override { return AtomicVar; } decorateWithTBAA(llvm::Instruction * I)506 void decorateWithTBAA(llvm::Instruction *I) override {} CreateAlloca(llvm::Type * Ty,const llvm::Twine & Name)507 llvm::AllocaInst *CreateAlloca(llvm::Type *Ty, 508 const llvm::Twine &Name) const override { 509 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty); 510 allocaInst->setName(Name); 511 return allocaInst; 512 } 513 }; 514 /// Initialize the internal state, this will put structures types and 515 /// potentially other helpers into 
the underlying module. Must be called 516 /// before any other method and only once! This internal state includes types 517 /// used in the OpenMPIRBuilder generated from OMPKinds.def. 518 LLVM_ABI void initialize(); 519 setConfig(OpenMPIRBuilderConfig C)520 void setConfig(OpenMPIRBuilderConfig C) { Config = C; } 521 522 /// Finalize the underlying module, e.g., by outlining regions. 523 /// \param Fn The function to be finalized. If not used, 524 /// all functions are finalized. 525 LLVM_ABI void finalize(Function *Fn = nullptr); 526 527 /// Check whether the finalize function has already run 528 /// \return true if the finalize function has already run 529 LLVM_ABI bool isFinalized(); 530 531 /// Add attributes known for \p FnID to \p Fn. 532 LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn); 533 534 /// Type used throughout for insertion points. 535 using InsertPointTy = IRBuilder<>::InsertPoint; 536 537 /// Type used to represent an insertion point or an error value. 538 using InsertPointOrErrorTy = Expected<InsertPointTy>; 539 540 /// Get the create a name using the platform specific separators. 541 /// \param Parts parts of the final name that needs separation 542 /// The created name has a first separator between the first and second part 543 /// and a second separator between all other parts. 544 /// E.g. with FirstSeparator "$" and Separator "." and 545 /// parts: "p1", "p2", "p3", "p4" 546 /// The resulting name is "p1$p2.p3.p4" 547 /// The separators are retrieved from the OpenMPIRBuilderConfig. 548 LLVM_ABI std::string 549 createPlatformSpecificName(ArrayRef<StringRef> Parts) const; 550 551 /// Callback type for variable finalization (think destructors). 552 /// 553 /// \param CodeGenIP is the insertion point at which the finalization code 554 /// should be placed. 555 /// 556 /// A finalize callback knows about all objects that need finalization, e.g. 
557 /// destruction, when the scope of the currently generated construct is left 558 /// at the time, and location, the callback is invoked. 559 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>; 560 561 struct FinalizationInfo { 562 /// The finalization callback provided by the last in-flight invocation of 563 /// createXXXX for the directive of kind DK. 564 FinalizeCallbackTy FiniCB; 565 566 /// The directive kind of the innermost directive that has an associated 567 /// region which might require finalization when it is left. 568 omp::Directive DK; 569 570 /// Flag to indicate if the directive is cancellable. 571 bool IsCancellable; 572 }; 573 574 /// Push a finalization callback on the finalization stack. 575 /// 576 /// NOTE: Temporary solution until Clang CG is gone. pushFinalizationCB(const FinalizationInfo & FI)577 void pushFinalizationCB(const FinalizationInfo &FI) { 578 FinalizationStack.push_back(FI); 579 } 580 581 /// Pop the last finalization callback from the finalization stack. 582 /// 583 /// NOTE: Temporary solution until Clang CG is gone. popFinalizationCB()584 void popFinalizationCB() { FinalizationStack.pop_back(); } 585 586 /// Callback type for body (=inner region) code generation 587 /// 588 /// The callback takes code locations as arguments, each describing a 589 /// location where additional instructions can be inserted. 590 /// 591 /// The CodeGenIP may be in the middle of a basic block or point to the end of 592 /// it. The basic block may have a terminator or be degenerate. The callback 593 /// function may just insert instructions at that position, but also split the 594 /// block (without the Before argument of BasicBlock::splitBasicBlock such 595 /// that the identify of the split predecessor block is preserved) and insert 596 /// additional control flow, including branches that do not lead back to what 597 /// follows the CodeGenIP. 
Note that since the callback is allowed to split 598 /// the block, callers must assume that InsertPoints to positions in the 599 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If 600 /// such InsertPoints need to be preserved, it can split the block itself 601 /// before calling the callback. 602 /// 603 /// AllocaIP and CodeGenIP must not point to the same position. 604 /// 605 /// \param AllocaIP is the insertion point at which new alloca instructions 606 /// should be placed. The BasicBlock it is pointing to must 607 /// not be split. 608 /// \param CodeGenIP is the insertion point at which the body code should be 609 /// placed. 610 /// 611 /// \return an error, if any were triggered during execution. 612 using BodyGenCallbackTy = 613 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; 614 615 // This is created primarily for sections construct as llvm::function_ref 616 // (BodyGenCallbackTy) is not storable (as described in the comments of 617 // function_ref class - function_ref contains non-ownable reference 618 // to the callable. 619 /// 620 /// \return an error, if any were triggered during execution. 621 using StorableBodyGenCallbackTy = 622 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; 623 624 /// Callback type for loop body code generation. 625 /// 626 /// \param CodeGenIP is the insertion point where the loop's body code must be 627 /// placed. This will be a dedicated BasicBlock with a 628 /// conditional branch from the loop condition check and 629 /// terminated with an unconditional branch to the loop 630 /// latch. 631 /// \param IndVar is the induction variable usable at the insertion point. 632 /// 633 /// \return an error, if any were triggered during execution. 634 using LoopBodyGenCallbackTy = 635 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>; 636 637 /// Callback type for variable privatization (think copy & default 638 /// constructor). 
639 /// 640 /// \param AllocaIP is the insertion point at which new alloca instructions 641 /// should be placed. 642 /// \param CodeGenIP is the insertion point at which the privatization code 643 /// should be placed. 644 /// \param Original The value being copied/created, should not be used in the 645 /// generated IR. 646 /// \param Inner The equivalent of \p Original that should be used in the 647 /// generated IR; this is equal to \p Original if the value is 648 /// a pointer and can thus be passed directly, otherwise it is 649 /// an equivalent but different value. 650 /// \param ReplVal The replacement value, thus a copy or new created version 651 /// of \p Inner. 652 /// 653 /// \returns The new insertion point where code generation continues and 654 /// \p ReplVal the replacement value. 655 using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy( 656 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, 657 Value &Inner, Value *&ReplVal)>; 658 659 /// Description of a LLVM-IR insertion point (IP) and a debug/source location 660 /// (filename, line, column, ...). 661 struct LocationDescription { LocationDescriptionLocationDescription662 LocationDescription(const IRBuilderBase &IRB) 663 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {} LocationDescriptionLocationDescription664 LocationDescription(const InsertPointTy &IP) : IP(IP) {} LocationDescriptionLocationDescription665 LocationDescription(const InsertPointTy &IP, const DebugLoc &DL) 666 : IP(IP), DL(DL) {} 667 InsertPointTy IP; 668 DebugLoc DL; 669 }; 670 671 /// Emitter methods for OpenMP directives. 672 /// 673 ///{ 674 675 /// Generator for '#omp barrier' 676 /// 677 /// \param Loc The location where the barrier directive was encountered. 678 /// \param Kind The kind of directive that caused the barrier. 679 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier. 
680 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value 681 /// should be checked and acted upon. 682 /// \param ThreadID Optional parameter to pass in any existing ThreadID value. 683 /// 684 /// \returns The insertion point after the barrier. 685 LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, 686 omp::Directive Kind, 687 bool ForceSimpleCall = false, 688 bool CheckCancelFlag = true); 689 690 /// Generator for '#omp cancel' 691 /// 692 /// \param Loc The location where the directive was encountered. 693 /// \param IfCondition The evaluated 'if' clause expression, if any. 694 /// \param CanceledDirective The kind of directive that is cancled. 695 /// 696 /// \returns The insertion point after the barrier. 697 LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, 698 Value *IfCondition, 699 omp::Directive CanceledDirective); 700 701 /// Generator for '#omp cancellation point' 702 /// 703 /// \param Loc The location where the directive was encountered. 704 /// \param CanceledDirective The kind of directive that is cancled. 705 /// 706 /// \returns The insertion point after the barrier. 707 LLVM_ABI InsertPointOrErrorTy createCancellationPoint( 708 const LocationDescription &Loc, omp::Directive CanceledDirective); 709 710 /// Generator for '#omp parallel' 711 /// 712 /// \param Loc The insert and source location description. 713 /// \param AllocaIP The insertion points to be used for alloca instructions. 714 /// \param BodyGenCB Callback that will generate the region code. 715 /// \param PrivCB Callback to copy a given variable (think copy constructor). 716 /// \param FiniCB Callback to finalize variable copies. 717 /// \param IfCondition The evaluated 'if' clause expression, if any. 718 /// \param NumThreads The evaluated 'num_threads' clause expression, if any. 719 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind). 
720 /// \param IsCancellable Flag to indicate a cancellable parallel region. 721 /// 722 /// \returns The insertion position *after* the parallel. 723 LLVM_ABI InsertPointOrErrorTy createParallel( 724 const LocationDescription &Loc, InsertPointTy AllocaIP, 725 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, 726 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, 727 omp::ProcBindKind ProcBind, bool IsCancellable); 728 729 /// Generator for the control flow structure of an OpenMP canonical loop. 730 /// 731 /// This generator operates on the logical iteration space of the loop, i.e. 732 /// the caller only has to provide a loop trip count of the loop as defined by 733 /// base language semantics. The trip count is interpreted as an unsigned 734 /// integer. The induction variable passed to \p BodyGenCB will be of the same 735 /// type and run from 0 to \p TripCount - 1. It is up to the callback to 736 /// convert the logical iteration variable to the loop counter variable in the 737 /// loop body. 738 /// 739 /// \param Loc The insert and source location description. The insert 740 /// location can be between two instructions or the end of a 741 /// degenerate block (e.g. a BB under construction). 742 /// \param BodyGenCB Callback that will generate the loop body code. 743 /// \param TripCount Number of iterations the loop body is executed. 744 /// \param Name Base name used to derive BB and instruction names. 745 /// 746 /// \returns An object representing the created control flow structure which 747 /// can be used for loop-associated directives. 748 LLVM_ABI Expected<CanonicalLoopInfo *> 749 createCanonicalLoop(const LocationDescription &Loc, 750 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, 751 const Twine &Name = "loop"); 752 753 /// Calculate the trip count of a canonical loop. 754 /// 755 /// This allows specifying user-defined loop counter values using increment, 756 /// upper- and lower bounds. 
To disambiguate the terminology when counting 757 /// downwards, instead of lower bounds we use \p Start for the loop counter 758 /// value in the first body iteration. 759 /// 760 /// Consider the following limitations: 761 /// 762 /// * A loop counter space over all integer values of its bit-width cannot be 763 /// represented. E.g using uint8_t, its loop trip count of 256 cannot be 764 /// stored into an 8 bit integer): 765 /// 766 /// DO I = 0, 255, 1 767 /// 768 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g. 769 /// effectively counting downwards: 770 /// 771 /// for (uint8_t i = 100u; i > 0; i += 127u) 772 /// 773 /// 774 /// TODO: May need to add additional parameters to represent: 775 /// 776 /// * Allow representing downcounting with unsigned integers. 777 /// 778 /// * Sign of the step and the comparison operator might disagree: 779 /// 780 /// for (int i = 0; i < 42; i -= 1u) 781 /// 782 /// \param Loc The insert and source location description. 783 /// \param Start Value of the loop counter for the first iterations. 784 /// \param Stop Loop counter values past this will stop the loop. 785 /// \param Step Loop counter increment after each iteration; negative 786 /// means counting down. 787 /// \param IsSigned Whether Start, Stop and Step are signed integers. 788 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop 789 /// counter. 790 /// \param Name Base name used to derive instruction names. 791 /// 792 /// \returns The value holding the calculated trip count. 793 LLVM_ABI Value *calculateCanonicalLoopTripCount( 794 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, 795 bool IsSigned, bool InclusiveStop, const Twine &Name = "loop"); 796 797 /// Generator for the control flow structure of an OpenMP canonical loop. 798 /// 799 /// Instead of a logical iteration space, this allows specifying user-defined 800 /// loop counter values using increment, upper- and lower bounds. 
To 801 /// disambiguate the terminology when counting downwards, instead of lower 802 /// bounds we use \p Start for the loop counter value in the first body iteration. 803 /// 804 /// It calls \see calculateCanonicalLoopTripCount for trip count calculations, 805 /// so limitations of that method apply here as well. 806 /// 807 /// \param Loc The insert and source location description. 808 /// \param BodyGenCB Callback that will generate the loop body code. 809 /// \param Start Value of the loop counter for the first iteration. 810 /// \param Stop Loop counter values past this will stop the loop. 811 /// \param Step Loop counter increment after each iteration; negative 812 /// means counting down. 813 /// \param IsSigned Whether Start, Stop and Step are signed integers. 814 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop 815 /// counter. 816 /// \param ComputeIP Insertion point for instructions computing the trip 817 /// count. Can be used to ensure the trip count is available 818 /// at the outermost loop of a loop nest. If not set, 819 /// defaults to the preheader of the generated loop. 820 /// \param Name Base name used to derive BB and instruction names. 821 /// 822 /// \returns An object representing the created control flow structure which 823 /// can be used for loop-associated directives. 824 LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop( 825 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, 826 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, 827 InsertPointTy ComputeIP = {}, const Twine &Name = "loop"); 828 829 /// Collapse a loop nest into a single loop. 830 /// 831 /// Merges loops of a loop nest into a single CanonicalLoopNest representation 832 /// that has the same number of innermost loop iterations as the origin loop 833 /// nest. The induction variables of the input loops are derived from the 834 /// collapsed loop's induction variable. 
This is intended to be used to 835 /// implement OpenMP's collapse clause. Before applying a directive, 836 /// collapseLoops normalizes a loop nest to contain only a single loop and the 837 /// directive's implementation does not need to handle multiple loops itself. 838 /// This does not remove the need to handle all loop nest handling by 839 /// directives, such as the ordered(<n>) clause or the simd schedule-clause 840 /// modifier of the worksharing-loop directive. 841 /// 842 /// Example: 843 /// \code 844 /// for (int i = 0; i < 7; ++i) // Canonical loop "i" 845 /// for (int j = 0; j < 9; ++j) // Canonical loop "j" 846 /// body(i, j); 847 /// \endcode 848 /// 849 /// After collapsing with Loops={i,j}, the loop is changed to 850 /// \code 851 /// for (int ij = 0; ij < 63; ++ij) { 852 /// int i = ij / 9; 853 /// int j = ij % 9; 854 /// body(i, j); 855 /// } 856 /// \endcode 857 /// 858 /// In the current implementation, the following limitations apply: 859 /// 860 /// * All input loops have an induction variable of the same type. 861 /// 862 /// * The collapsed loop will have the same trip count integer type as the 863 /// input loops. Therefore it is possible that the collapsed loop cannot 864 /// represent all iterations of the input loops. For instance, assuming a 865 /// 32 bit integer type, and two input loops both iterating 2^16 times, the 866 /// theoretical trip count of the collapsed loop would be 2^32 iteration, 867 /// which cannot be represented in an 32-bit integer. Behavior is undefined 868 /// in this case. 869 /// 870 /// * The trip counts of every input loop must be available at \p ComputeIP. 871 /// Non-rectangular loops are not yet supported. 872 /// 873 /// * At each nest level, code between a surrounding loop and its nested loop 874 /// is hoisted into the loop body, and such code will be executed more 875 /// often than before collapsing (or not at all if any inner loop iteration 876 /// has a trip count of 0). 
This is permitted by the OpenMP specification. 877 /// 878 /// \param DL Debug location for instructions added for collapsing, 879 /// such as instructions to compute/derive the input loop's 880 /// induction variables. 881 /// \param Loops Loops in the loop nest to collapse. Loops are specified 882 /// from outermost-to-innermost and every control flow of a 883 /// loop's body must pass through its directly nested loop. 884 /// \param ComputeIP Where additional instruction that compute the collapsed 885 /// trip count. If not set, defaults to before the generated 886 /// loop. 887 /// 888 /// \returns The CanonicalLoopInfo object representing the collapsed loop. 889 LLVM_ABI CanonicalLoopInfo *collapseLoops(DebugLoc DL, 890 ArrayRef<CanonicalLoopInfo *> Loops, 891 InsertPointTy ComputeIP); 892 893 /// Get the default alignment value for given target 894 /// 895 /// \param TargetTriple Target triple 896 /// \param Features StringMap which describes extra CPU features 897 LLVM_ABI static unsigned 898 getOpenMPDefaultSimdAlign(const Triple &TargetTriple, 899 const StringMap<bool> &Features); 900 901 /// Retrieve (or create if non-existent) the address of a declare 902 /// target variable, used in conjunction with registerTargetGlobalVariable 903 /// to create declare target global variables. 904 /// 905 /// \param CaptureClause - enumerator corresponding to the OpenMP capture 906 /// clause used in conjunction with the variable being registered (link, 907 /// to, enter). 
908 /// \param DeviceClause - enumerator corresponding to the OpenMP capture 909 /// clause used in conjunction with the variable being registered (nohost, 910 /// host, any) 911 /// \param IsDeclaration - boolean stating if the variable being registered 912 /// is a declaration-only and not a definition 913 /// \param IsExternallyVisible - boolean stating if the variable is externally 914 /// visible 915 /// \param EntryInfo - Unique entry information for the value generated 916 /// using getTargetEntryUniqueInfo, used to name generated pointer references 917 /// to the declare target variable 918 /// \param MangledName - the mangled name of the variable being registered 919 /// \param GeneratedRefs - references generated by invocations of 920 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar, 921 /// these are required by Clang for book keeping. 922 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled 923 /// \param TargetTriple - The OpenMP device target triple we are compiling 924 /// for 925 /// \param LlvmPtrTy - The type of the variable we are generating or 926 /// retrieving an address for 927 /// \param GlobalInitializer - a lambda function which creates a constant 928 /// used for initializing a pointer reference to the variable in certain 929 /// cases. If a nullptr is passed, it will default to utilising the original 930 /// variable to initialize the pointer reference. 931 /// \param VariableLinkage - a lambda function which returns the variables 932 /// linkage type, if unspecified and a nullptr is given, it will instead 933 /// utilise the linkage stored on the existing global variable in the 934 /// LLVMModule. 
935 LLVM_ABI Constant *getAddrOfDeclareTargetVar( 936 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 937 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 938 bool IsDeclaration, bool IsExternallyVisible, 939 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 940 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 941 std::vector<Triple> TargetTriple, Type *LlvmPtrTy, 942 std::function<Constant *()> GlobalInitializer, 943 std::function<GlobalValue::LinkageTypes()> VariableLinkage); 944 945 /// Registers a target variable for device or host. 946 /// 947 /// \param CaptureClause - enumerator corresponding to the OpenMP capture 948 /// clause used in conjunction with the variable being registered (link, 949 /// to, enter). 950 /// \param DeviceClause - enumerator corresponding to the OpenMP capture 951 /// clause used in conjunction with the variable being registered (nohost, 952 /// host, any) 953 /// \param IsDeclaration - boolean stating if the variable being registered 954 /// is a declaration-only and not a definition 955 /// \param IsExternallyVisible - boolean stating if the variable is externally 956 /// visible 957 /// \param EntryInfo - Unique entry information for the value generated 958 /// using getTargetEntryUniqueInfo, used to name generated pointer references 959 /// to the declare target variable 960 /// \param MangledName - the mangled name of the variable being registered 961 /// \param GeneratedRefs - references generated by invocations of 962 /// registerTargetGlobalVariable these are required by Clang for book 963 /// keeping. 964 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled 965 /// \param TargetTriple - The OpenMP device target triple we are compiling 966 /// for 967 /// \param GlobalInitializer - a lambda function which creates a constant 968 /// used for initializing a pointer reference to the variable in certain 969 /// cases. 
If a nullptr is passed, it will default to utilising the original 970 /// variable to initialize the pointer reference. 971 /// \param VariableLinkage - a lambda function which returns the variables 972 /// linkage type, if unspecified and a nullptr is given, it will instead 973 /// utilise the linkage stored on the existing global variable in the 974 /// LLVMModule. 975 /// \param LlvmPtrTy - The type of the variable we are generating or 976 /// retrieving an address for 977 /// \param Addr - the original llvm value (addr) of the variable to be 978 /// registered 979 LLVM_ABI void registerTargetGlobalVariable( 980 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 981 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 982 bool IsDeclaration, bool IsExternallyVisible, 983 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 984 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 985 std::vector<Triple> TargetTriple, 986 std::function<Constant *()> GlobalInitializer, 987 std::function<GlobalValue::LinkageTypes()> VariableLinkage, 988 Type *LlvmPtrTy, Constant *Addr); 989 990 /// Get the offset of the OMP_MAP_MEMBER_OF field. 991 LLVM_ABI unsigned getFlagMemberOffset(); 992 993 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on 994 /// the position given. 995 /// \param Position - A value indicating the position of the parent 996 /// of the member in the kernel argument structure, often retrieved 997 /// by the parents position in the combined information vectors used 998 /// to generate the structure itself. Multiple children (member's of) 999 /// with the same parent will use the same returned member flag. 1000 LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position); 1001 1002 /// Given an initial flag set, this function modifies it to contain 1003 /// the passed in MemberOfFlag generated from the getMemberOfFlag 1004 /// function. 
The results are dependent on the existing flag bits 1005 /// set in the original flag set. 1006 /// \param Flags - The original set of flags to be modified with the 1007 /// passed in MemberOfFlag. 1008 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted 1009 /// slightly based on the getMemberOfFlag which adjusts the flag bits 1010 /// based on the members position in its parent. 1011 LLVM_ABI void 1012 setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, 1013 omp::OpenMPOffloadMappingFlags MemberOfFlag); 1014 1015 private: 1016 /// Modifies the canonical loop to be a statically-scheduled workshare loop 1017 /// which is executed on the device 1018 /// 1019 /// This takes a \p CLI representing a canonical loop, such as the one 1020 /// created by \see createCanonicalLoop and emits additional instructions to 1021 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1022 /// runtime function in the preheader to call OpenMP device rtl function 1023 /// which handles worksharing of loop body interations. 1024 /// 1025 /// \param DL Debug location for instructions added for the 1026 /// workshare-loop construct itself. 1027 /// \param CLI A descriptor of the canonical loop to workshare. 1028 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1029 /// preheader of the loop. 1030 /// \param LoopType Information about type of loop worksharing. 1031 /// It corresponds to type of loop workshare OpenMP pragma. 1032 /// 1033 /// \returns Point where to insert code after the workshare construct. 1034 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, 1035 InsertPointTy AllocaIP, 1036 omp::WorksharingLoopType LoopType); 1037 1038 /// Modifies the canonical loop to be a statically-scheduled workshare loop. 
1039 /// 1040 /// This takes a \p LoopInfo representing a canonical loop, such as the one 1041 /// created by \p createCanonicalLoop and emits additional instructions to 1042 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1043 /// runtime function in the preheader to obtain the loop bounds to be used in 1044 /// the current thread, updates the relevant instructions in the canonical 1045 /// loop and calls to an OpenMP runtime finalization function after the loop. 1046 /// 1047 /// \param DL Debug location for instructions added for the 1048 /// workshare-loop construct itself. 1049 /// \param CLI A descriptor of the canonical loop to workshare. 1050 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1051 /// preheader of the loop. 1052 /// \param NeedsBarrier Indicates whether a barrier must be inserted after 1053 /// the loop. 1054 /// \param LoopType Type of workshare loop. 1055 /// 1056 /// \returns Point where to insert code after the workshare construct. 1057 InsertPointOrErrorTy applyStaticWorkshareLoop( 1058 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 1059 omp::WorksharingLoopType LoopType, bool NeedsBarrier); 1060 1061 /// Modifies the canonical loop a statically-scheduled workshare loop with a 1062 /// user-specified chunk size. 1063 /// 1064 /// \param DL Debug location for instructions added for the 1065 /// workshare-loop construct itself. 1066 /// \param CLI A descriptor of the canonical loop to workshare. 1067 /// \param AllocaIP An insertion point for Alloca instructions usable in 1068 /// the preheader of the loop. 1069 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the 1070 /// loop. 1071 /// \param ChunkSize The user-specified chunk size. 1072 /// 1073 /// \returns Point where to insert code after the workshare construct. 
1074 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(DebugLoc DL, 1075 CanonicalLoopInfo *CLI, 1076 InsertPointTy AllocaIP, 1077 bool NeedsBarrier, 1078 Value *ChunkSize); 1079 1080 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop. 1081 /// 1082 /// This takes a \p LoopInfo representing a canonical loop, such as the one 1083 /// created by \p createCanonicalLoop and emits additional instructions to 1084 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1085 /// runtime function in the preheader to obtain, and then in each iteration 1086 /// to update the loop counter. 1087 /// 1088 /// \param DL Debug location for instructions added for the 1089 /// workshare-loop construct itself. 1090 /// \param CLI A descriptor of the canonical loop to workshare. 1091 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1092 /// preheader of the loop. 1093 /// \param SchedType Type of scheduling to be passed to the init function. 1094 /// \param NeedsBarrier Indicates whether a barrier must be insterted after 1095 /// the loop. 1096 /// \param Chunk The size of loop chunk considered as a unit when 1097 /// scheduling. If \p nullptr, defaults to 1. 1098 /// 1099 /// \returns Point where to insert code after the workshare construct. 1100 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL, 1101 CanonicalLoopInfo *CLI, 1102 InsertPointTy AllocaIP, 1103 omp::OMPScheduleType SchedType, 1104 bool NeedsBarrier, 1105 Value *Chunk = nullptr); 1106 1107 /// Create alternative version of the loop to support if clause 1108 /// 1109 /// OpenMP if clause can require to generate second loop. This loop 1110 /// will be executed when if clause condition is not met. createIfVersion 1111 /// adds branch instruction to the copied loop if \p ifCond is not met. 1112 /// 1113 /// \param Loop Original loop which should be versioned. 
1114 /// \param IfCond Value which corresponds to if clause condition 1115 /// \param VMap Value to value map to define relation between 1116 /// original and copied loop values and loop blocks. 1117 /// \param NamePrefix Optional name prefix for if.then if.else blocks. 1118 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond, 1119 ValueMap<const Value *, WeakTrackingVH> &VMap, 1120 LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L, 1121 const Twine &NamePrefix = ""); 1122 1123 public: 1124 /// Modifies the canonical loop to be a workshare loop. 1125 /// 1126 /// This takes a \p LoopInfo representing a canonical loop, such as the one 1127 /// created by \p createCanonicalLoop and emits additional instructions to 1128 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1129 /// runtime function in the preheader to obtain the loop bounds to be used in 1130 /// the current thread, updates the relevant instructions in the canonical 1131 /// loop and calls to an OpenMP runtime finalization function after the loop. 1132 /// 1133 /// The concrete transformation is done by applyStaticWorkshareLoop, 1134 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending 1135 /// on the value of \p SchedKind and \p ChunkSize. 1136 /// 1137 /// \param DL Debug location for instructions added for the 1138 /// workshare-loop construct itself. 1139 /// \param CLI A descriptor of the canonical loop to workshare. 1140 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1141 /// preheader of the loop. 1142 /// \param NeedsBarrier Indicates whether a barrier must be insterted after 1143 /// the loop. 1144 /// \param SchedKind Scheduling algorithm to use. 1145 /// \param ChunkSize The chunk size for the inner loop. 1146 /// \param HasSimdModifier Whether the simd modifier is present in the 1147 /// schedule clause. 1148 /// \param HasMonotonicModifier Whether the monotonic modifier is present in 1149 /// the schedule clause. 
1150 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is 1151 /// present in the schedule clause. 1152 /// \param HasOrderedClause Whether the (parameterless) ordered clause is 1153 /// present. 1154 /// \param LoopType Information about type of loop worksharing. 1155 /// It corresponds to type of loop workshare OpenMP pragma. 1156 /// 1157 /// \returns Point where to insert code after the workshare construct. 1158 LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop( 1159 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 1160 bool NeedsBarrier, 1161 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default, 1162 Value *ChunkSize = nullptr, bool HasSimdModifier = false, 1163 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false, 1164 bool HasOrderedClause = false, 1165 omp::WorksharingLoopType LoopType = 1166 omp::WorksharingLoopType::ForStaticLoop); 1167 1168 /// Tile a loop nest. 1169 /// 1170 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in 1171 /// \p Loops must be perfectly nested, from outermost to innermost loop 1172 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value 1173 /// of every loop and every tile size must be usable in the outermost 1174 /// loop's preheader. This implies that the loop nest is rectangular. 1175 /// 1176 /// Example: 1177 /// \code 1178 /// for (int i = 0; i < 15; ++i) // Canonical loop "i" 1179 /// for (int j = 0; j < 14; ++j) // Canonical loop "j" 1180 /// body(i, j); 1181 /// \endcode 1182 /// 1183 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to 1184 /// \code 1185 /// for (int i1 = 0; i1 < 3; ++i1) 1186 /// for (int j1 = 0; j1 < 2; ++j1) 1187 /// for (int i2 = 0; i2 < 5; ++i2) 1188 /// for (int j2 = 0; j2 < 7; ++j2) 1189 /// body(i1*5+i2, j1*7+j2); 1190 /// \endcode 1191 /// 1192 /// The returned vector are the loops {i1,j1,i2,j2}. 
The loops i1 and j1 are 1193 /// referred to the floor, and the loops i2 and j2 are the tiles. Tiling also 1194 /// handles non-constant trip counts, non-constant tile sizes and trip counts 1195 /// that are not multiples of the tile size. In the latter case the tile loop 1196 /// of the last floor-loop iteration will have fewer iterations than specified 1197 /// as its tile size. 1198 /// 1199 /// 1200 /// @param DL Debug location for instructions added by tiling, for 1201 /// instance the floor- and tile trip count computation. 1202 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are 1203 /// invalidated by this method, i.e. should not used after 1204 /// tiling. 1205 /// @param TileSizes For each loop in \p Loops, the tile size for that 1206 /// dimensions. 1207 /// 1208 /// \returns A list of generated loops. Contains twice as many loops as the 1209 /// input loop nest; the first half are the floor loops and the 1210 /// second half are the tile loops. 1211 LLVM_ABI std::vector<CanonicalLoopInfo *> 1212 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, 1213 ArrayRef<Value *> TileSizes); 1214 1215 /// Fully unroll a loop. 1216 /// 1217 /// Instead of unrolling the loop immediately (and duplicating its body 1218 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop 1219 /// metadata. 1220 /// 1221 /// \param DL Debug location for instructions added by unrolling. 1222 /// \param Loop The loop to unroll. The loop will be invalidated. 1223 LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop); 1224 1225 /// Fully or partially unroll a loop. How the loop is unrolled is determined 1226 /// using LLVM's LoopUnrollPass. 1227 /// 1228 /// \param DL Debug location for instructions added by unrolling. 1229 /// \param Loop The loop to unroll. The loop will be invalidated. 1230 LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop); 1231 1232 /// Partially unroll a loop. 
1233 /// 1234 /// The CanonicalLoopInfo of the unrolled loop for use with chained 1235 /// loop-associated directive can be requested using \p UnrolledCLI. Not 1236 /// needing the CanonicalLoopInfo allows more efficient code generation by 1237 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata. 1238 /// A loop-associated directive applied to the unrolled loop needs to know the 1239 /// new trip count which means that if using a heuristically determined unroll 1240 /// factor (\p Factor == 0), that factor must be computed immediately. We are 1241 /// using the same logic as the LoopUnrollPass to derive the unroll factor, 1242 /// but which assumes that some canonicalization has taken place (e.g. 1243 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform 1244 /// better when the unrolled loop's CanonicalLoopInfo is not needed. 1245 /// 1246 /// \param DL Debug location for instructions added by unrolling. 1247 /// \param Loop The loop to unroll. The loop will be invalidated. 1248 /// \param Factor The factor to unroll the loop by. A factor of 0 1249 /// indicates that a heuristic should be used to determine 1250 /// the unroll-factor. 1251 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the 1252 /// partially unrolled loop. Otherwise, uses loop metadata 1253 /// to defer unrolling to the LoopUnrollPass. 1254 LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, 1255 int32_t Factor, 1256 CanonicalLoopInfo **UnrolledCLI); 1257 1258 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop 1259 /// is cloned. The metadata which prevents vectorization is added to 1260 /// the cloned loop. The cloned loop is executed when \p IfCond is evaluated 1261 /// to false. 1262 /// 1263 /// \param Loop The loop to simd-ize. 1264 /// \param AlignedVars The map which contains pairs of the pointer 1265 /// and its corresponding alignment. 
1266 /// \param IfCond The value which corresponds to the if clause 1267 /// condition. 1268 /// \param Order The enum to map order clause. 1269 /// \param Simdlen The Simdlen length to apply to the simd loop. 1270 /// \param Safelen The Safelen length to apply to the simd loop. 1271 LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, 1272 MapVector<Value *, Value *> AlignedVars, 1273 Value *IfCond, omp::OrderKind Order, 1274 ConstantInt *Simdlen, ConstantInt *Safelen); 1275 1276 /// Generator for '#omp flush' 1277 /// 1278 /// \param Loc The location where the flush directive was encountered. 1279 LLVM_ABI void createFlush(const LocationDescription &Loc); 1280 1281 /// Generator for '#omp taskwait' 1282 /// 1283 /// \param Loc The location where the taskwait directive was encountered. 1284 LLVM_ABI void createTaskwait(const LocationDescription &Loc); 1285 1286 /// Generator for '#omp taskyield' 1287 /// 1288 /// \param Loc The location where the taskyield directive was encountered. 1289 LLVM_ABI void createTaskyield(const LocationDescription &Loc); 1290 1291 /// A struct to pack the relevant information for an OpenMP depend clause. 1292 struct DependData { 1293 omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown; 1294 Type *DepValueType; 1295 Value *DepVal; 1296 explicit DependData() = default; 1297 DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType, 1298 Value *DepVal) 1299 : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {} 1300 }; 1301 1302 /// Generator for `#omp task` 1303 /// 1304 /// \param Loc The location where the task construct was encountered. 1305 /// \param AllocaIP The insertion point to be used for alloca instructions. 1306 /// \param BodyGenCB Callback that will generate the region code. 1307 /// \param Tied True if the task is tied, false if the task is untied. 1308 /// \param Final i1 value which is `true` if the task is final, `false` if the 1309 /// task is not final. 
1310 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred 1311 /// task is generated, and the encountering thread must 1312 /// suspend the current task region, for which execution 1313 /// cannot be resumed until execution of the structured 1314 /// block that is associated with the generated task is 1315 /// completed. 1316 /// \param EventHandle If present, signifies the event handle as part of 1317 /// the detach clause 1318 /// \param Mergeable If the given task is `mergeable` 1319 /// \param priority `priority-value' specifies the execution order of the 1320 /// tasks that is generated by the construct 1321 LLVM_ABI InsertPointOrErrorTy 1322 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, 1323 BodyGenCallbackTy BodyGenCB, bool Tied = true, 1324 Value *Final = nullptr, Value *IfCondition = nullptr, 1325 SmallVector<DependData> Dependencies = {}, bool Mergeable = false, 1326 Value *EventHandle = nullptr, Value *Priority = nullptr); 1327 1328 /// Generator for the taskgroup construct 1329 /// 1330 /// \param Loc The location where the taskgroup construct was encountered. 1331 /// \param AllocaIP The insertion point to be used for alloca instructions. 1332 /// \param BodyGenCB Callback that will generate the region code. 1333 LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, 1334 InsertPointTy AllocaIP, 1335 BodyGenCallbackTy BodyGenCB); 1336 1337 using FileIdentifierInfoCallbackTy = 1338 std::function<std::tuple<std::string, uint64_t>()>; 1339 1340 /// Creates a unique info for a target entry when provided a filename and 1341 /// line number from. 1342 /// 1343 /// \param CallBack A callback function which should return filename the entry 1344 /// resides in as well as the line number for the target entry 1345 /// \param ParentName The name of the parent the target entry resides in, if 1346 /// any. 
1347 LLVM_ABI static TargetRegionEntryInfo 1348 getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, 1349 StringRef ParentName = ""); 1350 1351 /// Enum class for the RedctionGen CallBack type to be used. 1352 enum class ReductionGenCBKind { Clang, MLIR }; 1353 1354 /// ReductionGen CallBack for Clang 1355 /// 1356 /// \param CodeGenIP InsertPoint for CodeGen. 1357 /// \param Index Index of the ReductionInfo to generate code for. 1358 /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for 1359 /// codegen, used for fixup later. 1360 /// \param RHSPtr Optionally used by Clang to 1361 /// return the RHSPtr it used for codegen, used for fixup later. 1362 /// \param CurFn Optionally used by Clang to pass in the Current Function as 1363 /// Clang context may be old. 1364 using ReductionGenClangCBTy = 1365 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index, 1366 Value **LHS, Value **RHS, Function *CurFn)>; 1367 1368 /// ReductionGen CallBack for MLIR 1369 /// 1370 /// \param CodeGenIP InsertPoint for CodeGen. 1371 /// \param LHS Pass in the LHS Value to be used for CodeGen. 1372 /// \param RHS Pass in the RHS Value to be used for CodeGen. 1373 using ReductionGenCBTy = std::function<InsertPointOrErrorTy( 1374 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>; 1375 1376 /// Functions used to generate atomic reductions. Such functions take two 1377 /// Values representing pointers to LHS and RHS of the reduction, as well as 1378 /// the element type of these pointers. They are expected to atomically 1379 /// update the LHS to the reduced value. 1380 using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy( 1381 InsertPointTy, Type *, Value *, Value *)>; 1382 1383 /// Enum class for reduction evaluation types scalar, complex and aggregate. 1384 enum class EvalKind { Scalar, Complex, Aggregate }; 1385 1386 /// Information about an OpenMP reduction. 
1387 struct ReductionInfo { ReductionInfoReductionInfo1388 ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, 1389 EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, 1390 ReductionGenClangCBTy ReductionGenClang, 1391 ReductionGenAtomicCBTy AtomicReductionGen) 1392 : ElementType(ElementType), Variable(Variable), 1393 PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), 1394 ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), 1395 AtomicReductionGen(AtomicReductionGen) {} ReductionInfoReductionInfo1396 ReductionInfo(Value *PrivateVariable) 1397 : ElementType(nullptr), Variable(nullptr), 1398 PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), 1399 ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} 1400 1401 /// Reduction element type, must match pointee type of variable. 1402 Type *ElementType; 1403 1404 /// Reduction variable of pointer type. 1405 Value *Variable; 1406 1407 /// Thread-private partial reduction variable. 1408 Value *PrivateVariable; 1409 1410 /// Reduction evaluation kind - scalar, complex or aggregate. 1411 EvalKind EvaluationKind; 1412 1413 /// Callback for generating the reduction body. The IR produced by this will 1414 /// be used to combine two values in a thread-safe context, e.g., under 1415 /// lock or within the same thread, and therefore need not be atomic. 1416 ReductionGenCBTy ReductionGen; 1417 1418 /// Clang callback for generating the reduction body. The IR produced by 1419 /// this will be used to combine two values in a thread-safe context, e.g., 1420 /// under lock or within the same thread, and therefore need not be atomic. 1421 ReductionGenClangCBTy ReductionGenClang; 1422 1423 /// Callback for generating the atomic reduction body, may be null. The IR 1424 /// produced by this will be used to atomically combine two values during 1425 /// reduction. 
If null, the implementation will use the non-atomic version 1426 /// along with the appropriate synchronization mechanisms. 1427 ReductionGenAtomicCBTy AtomicReductionGen; 1428 }; 1429 1430 enum class CopyAction : unsigned { 1431 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in 1432 // the warp using shuffle instructions. 1433 RemoteLaneToThread, 1434 // ThreadCopy: Make a copy of a Reduce list on the thread's stack. 1435 ThreadCopy, 1436 }; 1437 1438 struct CopyOptionsTy { 1439 Value *RemoteLaneOffset = nullptr; 1440 Value *ScratchpadIndex = nullptr; 1441 Value *ScratchpadWidth = nullptr; 1442 }; 1443 1444 /// Supporting functions for Reductions CodeGen. 1445 private: 1446 /// Get the id of the current thread on the GPU. 1447 Value *getGPUThreadID(); 1448 1449 /// Get the GPU warp size. 1450 Value *getGPUWarpSize(); 1451 1452 /// Get the id of the warp in the block. 1453 /// We assume that the warp size is 32, which is always the case 1454 /// on the NVPTX device, to generate more efficient code. 1455 Value *getNVPTXWarpID(); 1456 1457 /// Get the id of the current lane in the Warp. 1458 /// We assume that the warp size is 32, which is always the case 1459 /// on the NVPTX device, to generate more efficient code. 1460 Value *getNVPTXLaneID(); 1461 1462 /// Cast value to the specified type. 1463 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType); 1464 1465 /// This function creates calls to one of two shuffle functions to copy 1466 /// variables between lanes in a warp. 1467 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element, 1468 Type *ElementType, Value *Offset); 1469 1470 /// Function to shuffle over the value from the remote lane. 
1471 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, 1472 Type *ElementType, Value *Offset, 1473 Type *ReductionArrayTy); 1474 1475 /// Emit instructions to copy a Reduce list, which contains partially 1476 /// aggregated values, in the specified direction. 1477 void emitReductionListCopy( 1478 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, 1479 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, 1480 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); 1481 1482 /// Emit a helper that reduces data across two OpenMP threads (lanes) 1483 /// in the same warp. It uses shuffle instructions to copy over data from 1484 /// a remote lane's stack. The reduction algorithm performed is specified 1485 /// by the fourth parameter. 1486 /// 1487 /// Algorithm Versions. 1488 /// Full Warp Reduce (argument value 0): 1489 /// This algorithm assumes that all 32 lanes are active and gathers 1490 /// data from these 32 lanes, producing a single resultant value. 1491 /// Contiguous Partial Warp Reduce (argument value 1): 1492 /// This algorithm assumes that only a *contiguous* subset of lanes 1493 /// are active. This happens for the last warp in a parallel region 1494 /// when the user specified num_threads is not an integer multiple of 1495 /// 32. This contiguous subset always starts with the zeroth lane. 1496 /// Partial Warp Reduce (argument value 2): 1497 /// This algorithm gathers data from any number of lanes at any position. 1498 /// All reduced values are stored in the lowest possible lane. The set 1499 /// of problems every algorithm addresses is a super set of those 1500 /// addressable by algorithms with a lower version number. Overhead 1501 /// increases as algorithm version increases. 1502 /// 1503 /// Terminology 1504 /// Reduce element: 1505 /// Reduce element refers to the individual data field with primitive 1506 /// data types to be combined and reduced across threads. 
1507 /// Reduce list: 1508 /// Reduce list refers to a collection of local, thread-private 1509 /// reduce elements. 1510 /// Remote Reduce list: 1511 /// Remote Reduce list refers to a collection of remote (relative to 1512 /// the current thread) reduce elements. 1513 /// 1514 /// We distinguish between three states of threads that are important to 1515 /// the implementation of this function. 1516 /// Alive threads: 1517 /// Threads in a warp executing the SIMT instruction, as distinguished from 1518 /// threads that are inactive due to divergent control flow. 1519 /// Active threads: 1520 /// The minimal set of threads that has to be alive upon entry to this 1521 /// function. The computation is correct iff active threads are alive. 1522 /// Some threads are alive but they are not active because they do not 1523 /// contribute to the computation in any useful manner. Turning them off 1524 /// may introduce control flow overheads without any tangible benefits. 1525 /// Effective threads: 1526 /// In order to comply with the argument requirements of the shuffle 1527 /// function, we must keep all lanes holding data alive. But at most 1528 /// half of them perform value aggregation; we refer to this half of 1529 /// threads as effective. The other half is simply handing off their 1530 /// data. 1531 /// 1532 /// Procedure 1533 /// Value shuffle: 1534 /// In this step active threads transfer data from higher lane positions 1535 /// in the warp to lower lane positions, creating Remote Reduce list. 1536 /// Value aggregation: 1537 /// In this step, effective threads combine their thread local Reduce list 1538 /// with Remote Reduce list and store the result in the thread local 1539 /// Reduce list. 1540 /// Value copy: 1541 /// In this step, we deal with the assumption made by algorithm 2 1542 /// (i.e. contiguity assumption). 
When we have an odd number of lanes 1543 /// active, say 2k+1, only k threads will be effective and therefore k 1544 /// new values will be produced. However, the Reduce list owned by the 1545 /// (2k+1)th thread is ignored in the value aggregation. Therefore 1546 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so 1547 /// that the contiguity assumption still holds. 1548 /// 1549 /// \param ReductionInfos Array type containing the ReductionOps. 1550 /// \param ReduceFn The reduction function. 1551 /// \param FuncAttrs Optional param to specify any function attributes that 1552 /// need to be copied to the new function. 1553 /// 1554 /// \return The ShuffleAndReduce function. 1555 Function *emitShuffleAndReduceFunction( 1556 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, 1557 Function *ReduceFn, AttributeList FuncAttrs); 1558 1559 /// This function emits a helper that gathers Reduce lists from the first 1560 /// lane of every active warp to lanes in the first warp. 1561 /// 1562 /// void inter_warp_copy_func(void* reduce_data, num_warps) 1563 /// shared smem[warp_size]; 1564 /// For all data entries D in reduce_data: 1565 /// sync 1566 /// If (I am the first lane in each warp) 1567 /// Copy my local D to smem[warp_id] 1568 /// sync 1569 /// if (I am the first warp) 1570 /// Copy smem[thread_id] to my local D 1571 /// 1572 /// \param Loc The insert and source location description. 1573 /// \param ReductionInfos Array type containing the ReductionOps. 1574 /// \param FuncAttrs Optional param to specify any function attributes that 1575 /// need to be copied to the new function. 1576 /// 1577 /// \return The InterWarpCopy function. 
1578 Expected<Function *> 1579 emitInterWarpCopyFunction(const LocationDescription &Loc, 1580 ArrayRef<ReductionInfo> ReductionInfos, 1581 AttributeList FuncAttrs); 1582 1583 /// This function emits a helper that copies all the reduction variables from 1584 /// the team into the provided global buffer for the reduction variables. 1585 /// 1586 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 1587 /// For all data entries D in reduce_data: 1588 /// Copy local D to buffer.D[Idx] 1589 /// 1590 /// \param ReductionInfos Array type containing the ReductionOps. 1591 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1592 /// \param FuncAttrs Optional param to specify any function attributes that 1593 /// need to be copied to the new function. 1594 /// 1595 /// \return The ListToGlobalCopy function. 1596 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, 1597 Type *ReductionsBufferTy, 1598 AttributeList FuncAttrs); 1599 1600 /// This function emits a helper that copies all the reduction variables from 1601 /// the team into the provided global buffer for the reduction variables. 1602 /// 1603 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 1604 /// For all data entries D in reduce_data: 1605 /// Copy buffer.D[Idx] to local D; 1606 /// 1607 /// \param ReductionInfos Array type containing the ReductionOps. 1608 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1609 /// \param FuncAttrs Optional param to specify any function attributes that 1610 /// need to be copied to the new function. 1611 /// 1612 /// \return The GlobalToList function. 1613 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, 1614 Type *ReductionsBufferTy, 1615 AttributeList FuncAttrs); 1616 1617 /// This function emits a helper that reduces all the reduction variables from 1618 /// the team into the provided global buffer for the reduction variables. 
1619 /// 1620 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) 1621 /// void *GlobPtrs[]; 1622 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 1623 /// ... 1624 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 1625 /// reduce_function(GlobPtrs, reduce_data); 1626 /// 1627 /// \param ReductionInfos Array type containing the ReductionOps. 1628 /// \param ReduceFn The reduction function. 1629 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1630 /// \param FuncAttrs Optional param to specify any function attributes that 1631 /// need to be copied to the new function. 1632 /// 1633 /// \return The ListToGlobalReduce function. 1634 Function * 1635 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, 1636 Function *ReduceFn, Type *ReductionsBufferTy, 1637 AttributeList FuncAttrs); 1638 1639 /// This function emits a helper that reduces all the reduction variables from 1640 /// the team into the provided global buffer for the reduction variables. 1641 /// 1642 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) 1643 /// void *GlobPtrs[]; 1644 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 1645 /// ... 1646 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 1647 /// reduce_function(reduce_data, GlobPtrs); 1648 /// 1649 /// \param ReductionInfos Array type containing the ReductionOps. 1650 /// \param ReduceFn The reduction function. 1651 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1652 /// \param FuncAttrs Optional param to specify any function attributes that 1653 /// need to be copied to the new function. 1654 /// 1655 /// \return The GlobalToListReduce function. 1656 Function * 1657 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, 1658 Function *ReduceFn, Type *ReductionsBufferTy, 1659 AttributeList FuncAttrs); 1660 1661 /// Get the function name of a reduction function. 
1662 std::string getReductionFuncName(StringRef Name) const; 1663 1664 /// Emits reduction function. 1665 /// \param ReducerName Name of the function calling the reduction. 1666 /// \param ReductionInfos Array type containing the ReductionOps. 1667 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR 1668 /// CodeGenCB kind. 1669 /// \param FuncAttrs Optional param to specify any function attributes that 1670 /// need to be copied to the new function. 1671 /// 1672 /// \return The reduction function. 1673 Expected<Function *> createReductionFunction( 1674 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, 1675 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, 1676 AttributeList FuncAttrs = {}); 1677 1678 public: 1679 /// 1680 /// Design of OpenMP reductions on the GPU 1681 /// 1682 /// Consider a typical OpenMP program with one or more reduction 1683 /// clauses: 1684 /// 1685 /// float foo; 1686 /// double bar; 1687 /// #pragma omp target teams distribute parallel for \ 1688 /// reduction(+:foo) reduction(*:bar) 1689 /// for (int i = 0; i < N; i++) { 1690 /// foo += A[i]; bar *= B[i]; 1691 /// } 1692 /// 1693 /// where 'foo' and 'bar' are reduced across all OpenMP threads in 1694 /// all teams. In our OpenMP implementation on the NVPTX device an 1695 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 1696 /// within a team are mapped to CUDA threads within a threadblock. 1697 /// Our goal is to efficiently aggregate values across all OpenMP 1698 /// threads such that: 1699 /// 1700 /// - the compiler and runtime are logically concise, and 1701 /// - the reduction is performed efficiently in a hierarchical 1702 /// manner as follows: within OpenMP threads in the same warp, 1703 /// across warps in a threadblock, and finally across teams on 1704 /// the NVPTX device. 
1705 /// 1706 /// Introduction to Decoupling 1707 /// 1708 /// We would like to decouple the compiler and the runtime so that the 1709 /// latter is ignorant of the reduction variables (number, data types) 1710 /// and the reduction operators. This allows a simpler interface 1711 /// and implementation while still attaining good performance. 1712 /// 1713 /// Pseudocode for the aforementioned OpenMP program generated by the 1714 /// compiler is as follows: 1715 /// 1716 /// 1. Create private copies of reduction variables on each OpenMP 1717 /// thread: 'foo_private', 'bar_private' 1718 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 1719 /// to it and writes the result in 'foo_private' and 'bar_private' 1720 /// respectively. 1721 /// 3. Call the OpenMP runtime on the GPU to reduce within a team 1722 /// and store the result on the team master: 1723 /// 1724 /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 1725 /// reduceData, shuffleReduceFn, interWarpCpyFn) 1726 /// 1727 /// where: 1728 /// struct ReduceData { 1729 /// double *foo; 1730 /// double *bar; 1731 /// } reduceData 1732 /// reduceData.foo = &foo_private 1733 /// reduceData.bar = &bar_private 1734 /// 1735 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 1736 /// auxiliary functions generated by the compiler that operate on 1737 /// variables of type 'ReduceData'. They aid the runtime perform 1738 /// algorithmic steps in a data agnostic manner. 1739 /// 1740 /// 'shuffleReduceFn' is a pointer to a function that reduces data 1741 /// of type 'ReduceData' across two OpenMP threads (lanes) in the 1742 /// same warp. It takes the following arguments as input: 1743 /// 1744 /// a. variable of type 'ReduceData' on the calling lane, 1745 /// b. its lane_id, 1746 /// c. an offset relative to the current lane_id to generate a 1747 /// remote_lane_id. The remote lane contains the second 1748 /// variable of type 'ReduceData' that is to be reduced. 1749 /// d. 
an algorithm version parameter determining which reduction 1750 /// algorithm to use. 1751 /// 1752 /// 'shuffleReduceFn' retrieves data from the remote lane using 1753 /// efficient GPU shuffle intrinsics and reduces, using the 1754 /// algorithm specified by the 4th parameter, the two operands 1755 /// element-wise. The result is written to the first operand. 1756 /// 1757 /// Different reduction algorithms are implemented in different 1758 /// runtime functions, all calling 'shuffleReduceFn' to perform 1759 /// the essential reduction step. Therefore, based on the 4th 1760 /// parameter, this function behaves slightly differently to 1761 /// cooperate with the runtime to ensure correctness under 1762 /// different circumstances. 1763 /// 1764 /// 'InterWarpCpyFn' is a pointer to a function that transfers 1765 /// reduced variables across warps. It tunnels, through CUDA 1766 /// shared memory, the thread-private data of type 'ReduceData' 1767 /// from lane 0 of each warp to a lane in the first warp. 1768 /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 1769 /// The last team writes the global reduced value to memory. 1770 /// 1771 /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 1772 /// reduceData, shuffleReduceFn, interWarpCpyFn, 1773 /// scratchpadCopyFn, loadAndReduceFn) 1774 /// 1775 /// 'scratchpadCopyFn' is a helper that stores reduced 1776 /// data from the team master to a scratchpad array in 1777 /// global memory. 1778 /// 1779 /// 'loadAndReduceFn' is a helper that loads data from 1780 /// the scratchpad array and reduces it with the input 1781 /// operand. 1782 /// 1783 /// These compiler generated functions hide address 1784 /// calculation and alignment information from the runtime. 1785 /// 5. if ret == 1: 1786 /// The team master of the last team stores the reduced 1787 /// result to the globals in memory. 
1788 /// foo += reduceData.foo; bar *= reduceData.bar 1789 /// 1790 /// 1791 /// Warp Reduction Algorithms 1792 /// 1793 /// On the warp level, we have three algorithms implemented in the 1794 /// OpenMP runtime depending on the number of active lanes: 1795 /// 1796 /// Full Warp Reduction 1797 /// 1798 /// The reduce algorithm within a warp where all lanes are active 1799 /// is implemented in the runtime as follows: 1800 /// 1801 /// full_warp_reduce(void *reduce_data, 1802 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1803 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 1804 /// ShuffleReduceFn(reduce_data, 0, offset, 0); 1805 /// } 1806 /// 1807 /// The algorithm completes in log(2, WARPSIZE) steps. 1808 /// 1809 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 1810 /// not used therefore we save instructions by not retrieving lane_id 1811 /// from the corresponding special registers. The 4th parameter, which 1812 /// represents the version of the algorithm being used, is set to 0 to 1813 /// signify full warp reduction. 1814 /// 1815 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1816 /// 1817 /// #reduce_elem refers to an element in the local lane's data structure 1818 /// #remote_elem is retrieved from a remote lane 1819 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1820 /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 1821 /// 1822 /// Contiguous Partial Warp Reduction 1823 /// 1824 /// This reduce algorithm is used within a warp where only the first 1825 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the 1826 /// number of OpenMP threads in a parallel region is not a multiple of 1827 /// WARPSIZE. 
The algorithm is implemented in the runtime as follows: 1828 /// 1829 /// void 1830 /// contiguous_partial_reduce(void *reduce_data, 1831 /// kmp_ShuffleReductFctPtr ShuffleReduceFn, 1832 /// int size, int lane_id) { 1833 /// int curr_size; 1834 /// int offset; 1835 /// curr_size = size; 1836 /// mask = curr_size/2; 1837 /// while (offset>0) { 1838 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); 1839 /// curr_size = (curr_size+1)/2; 1840 /// offset = curr_size/2; 1841 /// } 1842 /// } 1843 /// 1844 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1845 /// 1846 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1847 /// if (lane_id < offset) 1848 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1849 /// else 1850 /// reduce_elem = remote_elem 1851 /// 1852 /// This algorithm assumes that the data to be reduced are located in a 1853 /// contiguous subset of lanes starting from the first. When there is 1854 /// an odd number of active lanes, the data in the last lane is not 1855 /// aggregated with any other lane's dat but is instead copied over. 1856 /// 1857 /// Dispersed Partial Warp Reduction 1858 /// 1859 /// This algorithm is used within a warp when any discontiguous subset of 1860 /// lanes are active. It is used to implement the reduction operation 1861 /// across lanes in an OpenMP simd region or in a nested parallel region. 1862 /// 1863 /// void 1864 /// dispersed_partial_reduce(void *reduce_data, 1865 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1866 /// int size, remote_id; 1867 /// int logical_lane_id = number_of_active_lanes_before_me() * 2; 1868 /// do { 1869 /// remote_id = next_active_lane_id_right_after_me(); 1870 /// # the above function returns 0 of no active lane 1871 /// # is present right after the current lane. 
1872 /// size = number_of_active_lanes_in_this_warp(); 1873 /// logical_lane_id /= 2; 1874 /// ShuffleReduceFn(reduce_data, logical_lane_id, 1875 /// remote_id-1-threadIdx.x, 2); 1876 /// } while (logical_lane_id % 2 == 0 && size > 1); 1877 /// } 1878 /// 1879 /// There is no assumption made about the initial state of the reduction. 1880 /// Any number of lanes (>=1) could be active at any position. The reduction 1881 /// result is returned in the first active lane. 1882 /// 1883 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1884 /// 1885 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1886 /// if (lane_id % 2 == 0 && offset > 0) 1887 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1888 /// else 1889 /// reduce_elem = remote_elem 1890 /// 1891 /// 1892 /// Intra-Team Reduction 1893 /// 1894 /// This function, as implemented in the runtime call 1895 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 1896 /// threads in a team. It first reduces within a warp using the 1897 /// aforementioned algorithms. We then proceed to gather all such 1898 /// reduced values at the first warp. 1899 /// 1900 /// The runtime makes use of the function 'InterWarpCpyFn', which copies 1901 /// data from each of the "warp master" (zeroth lane of each warp, where 1902 /// warp-reduced data is held) to the zeroth warp. This step reduces (in 1903 /// a mathematical sense) the problem of reduction across warp masters in 1904 /// a block to the problem of warp reduction. 1905 /// 1906 /// 1907 /// Inter-Team Reduction 1908 /// 1909 /// Once a team has reduced its data to a single value, it is stored in 1910 /// a global scratchpad array. Since each team has a distinct slot, this 1911 /// can be done without locking. 1912 /// 1913 /// The last team to write to the scratchpad array proceeds to reduce the 1914 /// scratchpad array. 
One or more workers in the last team use the helper 1915 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 1916 /// the k'th worker reduces every k'th element. 1917 /// 1918 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 1919 /// reduce across workers and compute a globally reduced value. 1920 /// 1921 /// \param Loc The location where the reduction was 1922 /// encountered. Must be within the associate 1923 /// directive and after the last local access to the 1924 /// reduction variables. 1925 /// \param AllocaIP An insertion point suitable for allocas usable 1926 /// in reductions. 1927 /// \param CodeGenIP An insertion point suitable for code 1928 /// generation. \param ReductionInfos A list of info on each reduction 1929 /// variable. \param IsNoWait Optional flag set if the reduction is 1930 /// marked as 1931 /// nowait. 1932 /// \param IsTeamsReduction Optional flag set if it is a teams 1933 /// reduction. 1934 /// \param GridValue Optional GPU grid value. 1935 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be 1936 /// used for teams reduction. 1937 /// \param SrcLocInfo Source location information global. 1938 LLVM_ABI InsertPointOrErrorTy createReductionsGPU( 1939 const LocationDescription &Loc, InsertPointTy AllocaIP, 1940 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, 1941 bool IsNoWait = false, bool IsTeamsReduction = false, 1942 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, 1943 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, 1944 Value *SrcLocInfo = nullptr); 1945 1946 // TODO: provide atomic and non-atomic reduction generators for reduction 1947 // operators defined by the OpenMP specification. 1948 1949 /// Generator for '#omp reduction'. 1950 /// 1951 /// Emits the IR instructing the runtime to perform the specific kind of 1952 /// reductions. 
Expects reduction variables to have been privatized and 1953 /// initialized to reduction-neutral values separately. Emits the calls to 1954 /// runtime functions as well as the reduction function and the basic blocks 1955 /// performing the reduction atomically and non-atomically. 1956 /// 1957 /// The code emitted for the following: 1958 /// 1959 /// \code 1960 /// type var_1; 1961 /// type var_2; 1962 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2) 1963 /// /* body */; 1964 /// \endcode 1965 /// 1966 /// corresponds to the following sketch. 1967 /// 1968 /// \code 1969 /// void _outlined_par() { 1970 /// // N is the number of different reductions. 1971 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...}; 1972 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array, 1973 /// _omp_reduction_func, 1974 /// _gomp_critical_user.reduction.var)) { 1975 /// case 1: { 1976 /// var_1 = var_1 <reduction-op> privatized_var_1; 1977 /// var_2 = var_2 <reduction-op> privatized_var_2; 1978 /// // ... 1979 /// __kmpc_end_reduce(...); 1980 /// break; 1981 /// } 1982 /// case 2: { 1983 /// _Atomic<ReductionOp>(var_1, privatized_var_1); 1984 /// _Atomic<ReductionOp>(var_2, privatized_var_2); 1985 /// // ... 1986 /// break; 1987 /// } 1988 /// default: break; 1989 /// } 1990 /// } 1991 /// 1992 /// void _omp_reduction_func(void **lhs, void **rhs) { 1993 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0]; 1994 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1]; 1995 /// // ... 1996 /// } 1997 /// \endcode 1998 /// 1999 /// \param Loc The location where the reduction was 2000 /// encountered. Must be within the associate 2001 /// directive and after the last local access to the 2002 /// reduction variables. 2003 /// \param AllocaIP An insertion point suitable for allocas usable 2004 /// in reductions. 2005 /// \param ReductionInfos A list of info on each reduction variable. 
2006 /// \param IsNoWait A flag set if the reduction is marked as nowait. 2007 /// \param IsByRef A flag set if the reduction is using reference 2008 /// or direct value. 2009 /// \param IsTeamsReduction Optional flag set if it is a teams 2010 /// reduction. 2011 LLVM_ABI InsertPointOrErrorTy createReductions( 2012 const LocationDescription &Loc, InsertPointTy AllocaIP, 2013 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef, 2014 bool IsNoWait = false, bool IsTeamsReduction = false); 2015 2016 ///} 2017 2018 /// Return the insertion point used by the underlying IRBuilder. getInsertionPoint()2019 InsertPointTy getInsertionPoint() { return Builder.saveIP(); } 2020 2021 /// Update the internal location to \p Loc. updateToLocation(const LocationDescription & Loc)2022 bool updateToLocation(const LocationDescription &Loc) { 2023 Builder.restoreIP(Loc.IP); 2024 Builder.SetCurrentDebugLocation(Loc.DL); 2025 return Loc.IP.getBlock() != nullptr; 2026 } 2027 2028 /// Return the function declaration for the runtime function with \p FnID. 2029 LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, 2030 omp::RuntimeFunction FnID); 2031 2032 LLVM_ABI Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID); 2033 2034 /// Return the (LLVM-IR) string describing the source location \p LocStr. 2035 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef LocStr, 2036 uint32_t &SrcLocStrSize); 2037 2038 /// Return the (LLVM-IR) string describing the default source location. 2039 LLVM_ABI Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize); 2040 2041 /// Return the (LLVM-IR) string describing the source location identified by 2042 /// the arguments. 2043 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef FunctionName, 2044 StringRef FileName, unsigned Line, 2045 unsigned Column, 2046 uint32_t &SrcLocStrSize); 2047 2048 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. 
Use \p F as 2049 /// fallback if \p DL does not specify the function name. 2050 LLVM_ABI Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize, 2051 Function *F = nullptr); 2052 2053 /// Return the (LLVM-IR) string describing the source location \p Loc. 2054 LLVM_ABI Constant *getOrCreateSrcLocStr(const LocationDescription &Loc, 2055 uint32_t &SrcLocStrSize); 2056 2057 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags. 2058 /// TODO: Create a enum class for the Reserve2Flags 2059 LLVM_ABI Constant *getOrCreateIdent(Constant *SrcLocStr, 2060 uint32_t SrcLocStrSize, 2061 omp::IdentFlag Flags = omp::IdentFlag(0), 2062 unsigned Reserve2Flags = 0); 2063 2064 /// Create a hidden global flag \p Name in the module with initial value \p 2065 /// Value. 2066 LLVM_ABI GlobalValue *createGlobalFlag(unsigned Value, StringRef Name); 2067 2068 /// Emit the llvm.used metadata. 2069 LLVM_ABI void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List); 2070 2071 /// Emit the kernel execution mode. 2072 LLVM_ABI GlobalVariable * 2073 emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode); 2074 2075 /// Generate control flow and cleanup for cancellation. 2076 /// 2077 /// \param CancelFlag Flag indicating if the cancellation is performed. 2078 /// \param CanceledDirective The kind of directive that is cancled. 2079 /// \param ExitCB Extra code to be generated in the exit block. 2080 /// 2081 /// \return an error, if any were triggered during execution. 2082 LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, 2083 omp::Directive CanceledDirective, 2084 FinalizeCallbackTy ExitCB = {}); 2085 2086 /// Generate a target region entry call. 2087 /// 2088 /// \param Loc The location at which the request originated and is fulfilled. 2089 /// \param AllocaIP The insertion point to be used for alloca instructions. 2090 /// \param Return Return value of the created function returned by reference. 
2091 /// \param DeviceID Identifier for the device via the 'device' clause. 2092 /// \param NumTeams Numer of teams for the region via the 'num_teams' clause 2093 /// or 0 if unspecified and -1 if there is no 'teams' clause. 2094 /// \param NumThreads Number of threads via the 'thread_limit' clause. 2095 /// \param HostPtr Pointer to the host-side pointer of the target kernel. 2096 /// \param KernelArgs Array of arguments to the kernel. 2097 LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, 2098 InsertPointTy AllocaIP, 2099 Value *&Return, Value *Ident, 2100 Value *DeviceID, Value *NumTeams, 2101 Value *NumThreads, Value *HostPtr, 2102 ArrayRef<Value *> KernelArgs); 2103 2104 /// Generate a flush runtime call. 2105 /// 2106 /// \param Loc The location at which the request originated and is fulfilled. 2107 LLVM_ABI void emitFlush(const LocationDescription &Loc); 2108 2109 /// The finalization stack made up of finalize callbacks currently in-flight, 2110 /// wrapped into FinalizationInfo objects that reference also the finalization 2111 /// target block and the kind of cancellable directive. 2112 SmallVector<FinalizationInfo, 8> FinalizationStack; 2113 2114 /// Return true if the last entry in the finalization stack is of kind \p DK 2115 /// and cancellable. isLastFinalizationInfoCancellable(omp::Directive DK)2116 bool isLastFinalizationInfoCancellable(omp::Directive DK) { 2117 return !FinalizationStack.empty() && 2118 FinalizationStack.back().IsCancellable && 2119 FinalizationStack.back().DK == DK; 2120 } 2121 2122 /// Generate a taskwait runtime call. 2123 /// 2124 /// \param Loc The location at which the request originated and is fulfilled. 2125 LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc); 2126 2127 /// Generate a taskyield runtime call. 2128 /// 2129 /// \param Loc The location at which the request originated and is fulfilled. 
2130 LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc); 2131 2132 /// Return the current thread ID. 2133 /// 2134 /// \param Ident The ident (ident_t*) describing the query origin. 2135 LLVM_ABI Value *getOrCreateThreadID(Value *Ident); 2136 2137 /// The OpenMPIRBuilder Configuration 2138 OpenMPIRBuilderConfig Config; 2139 2140 /// The underlying LLVM-IR module 2141 Module &M; 2142 2143 /// The LLVM-IR Builder used to create IR. 2144 IRBuilder<> Builder; 2145 2146 /// Map to remember source location strings 2147 StringMap<Constant *> SrcLocStrMap; 2148 2149 /// Map to remember existing ident_t*. 2150 DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap; 2151 2152 /// Info manager to keep track of target regions. 2153 OffloadEntriesInfoManager OffloadInfoManager; 2154 2155 /// The target triple of the underlying module. 2156 const Triple T; 2157 2158 /// Helper that contains information about regions we need to outline 2159 /// during finalization. 2160 struct OutlineInfo { 2161 using PostOutlineCBTy = std::function<void(Function &)>; 2162 PostOutlineCBTy PostOutlineCB; 2163 BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; 2164 SmallVector<Value *, 2> ExcludeArgsFromAggregate; 2165 2166 /// Collect all blocks in between EntryBB and ExitBB in both the given 2167 /// vector and set. 2168 LLVM_ABI void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet, 2169 SmallVectorImpl<BasicBlock *> &BlockVector); 2170 2171 /// Return the function that contains the region to be outlined. getFunctionOutlineInfo2172 Function *getFunction() const { return EntryBB->getParent(); } 2173 }; 2174 2175 /// Collection of regions that need to be outlined during finalization. 2176 SmallVector<OutlineInfo, 16> OutlineInfos; 2177 2178 /// A collection of candidate target functions that's constant allocas will 2179 /// attempt to be raised on a call of finalize after all currently enqueued 2180 /// outline info's have been processed. 
2181 SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates; 2182 2183 /// Collection of owned canonical loop objects that eventually need to be 2184 /// free'd. 2185 std::forward_list<CanonicalLoopInfo> LoopInfos; 2186 2187 /// Add a new region that will be outlined later. addOutlineInfo(OutlineInfo && OI)2188 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); } 2189 2190 /// An ordered map of auto-generated variables to their unique names. 2191 /// It stores variables with the following names: 1) ".gomp_critical_user_" + 2192 /// <critical_section_name> + ".var" for "omp critical" directives; 2) 2193 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate 2194 /// variables. 2195 StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars; 2196 2197 /// Computes the size of type in bytes. 2198 LLVM_ABI Value *getSizeInBytes(Value *BasePtr); 2199 2200 // Emit a branch from the current block to the Target block only if 2201 // the current block has a terminator. 2202 LLVM_ABI void emitBranch(BasicBlock *Target); 2203 2204 // If BB has no use then delete it and return. Else place BB after the current 2205 // block, if possible, or else at the end of the function. Also add a branch 2206 // from current block to BB if current block does not have a terminator. 2207 LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, 2208 bool IsFinished = false); 2209 2210 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy 2211 /// Here is the logic: 2212 /// if (Cond) { 2213 /// ThenGen(); 2214 /// } else { 2215 /// ElseGen(); 2216 /// } 2217 /// 2218 /// \return an error, if any were triggered during execution. 2219 LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, 2220 BodyGenCallbackTy ElseGen, 2221 InsertPointTy AllocaIP = {}); 2222 2223 /// Create the global variable holding the offload mappings information. 
2224 LLVM_ABI GlobalVariable * 2225 createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings, 2226 std::string VarName); 2227 2228 /// Create the global variable holding the offload names information. 2229 LLVM_ABI GlobalVariable * 2230 createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names, 2231 std::string VarName); 2232 2233 struct MapperAllocas { 2234 AllocaInst *ArgsBase = nullptr; 2235 AllocaInst *Args = nullptr; 2236 AllocaInst *ArgSizes = nullptr; 2237 }; 2238 2239 /// Create the allocas instruction used in call to mapper functions. 2240 LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, 2241 InsertPointTy AllocaIP, 2242 unsigned NumOperands, 2243 struct MapperAllocas &MapperAllocas); 2244 2245 /// Create the call for the target mapper function. 2246 /// \param Loc The source location description. 2247 /// \param MapperFunc Function to be called. 2248 /// \param SrcLocInfo Source location information global. 2249 /// \param MaptypesArg The argument types. 2250 /// \param MapnamesArg The argument names. 2251 /// \param MapperAllocas The AllocaInst used for the call. 2252 /// \param DeviceID Device ID for the call. 2253 /// \param NumOperands Number of operands in the call. 2254 LLVM_ABI void emitMapperCall(const LocationDescription &Loc, 2255 Function *MapperFunc, Value *SrcLocInfo, 2256 Value *MaptypesArg, Value *MapnamesArg, 2257 struct MapperAllocas &MapperAllocas, 2258 int64_t DeviceID, unsigned NumOperands); 2259 2260 /// Container for the arguments used to pass data to the runtime library. 2261 struct TargetDataRTArgs { 2262 /// The array of base pointer passed to the runtime library. 2263 Value *BasePointersArray = nullptr; 2264 /// The array of section pointers passed to the runtime library. 2265 Value *PointersArray = nullptr; 2266 /// The array of sizes passed to the runtime library. 
2267 Value *SizesArray = nullptr; 2268 /// The array of map types passed to the runtime library for the beginning 2269 /// of the region or for the entire region if there are no separate map 2270 /// types for the region end. 2271 Value *MapTypesArray = nullptr; 2272 /// The array of map types passed to the runtime library for the end of the 2273 /// region, or nullptr if there are no separate map types for the region 2274 /// end. 2275 Value *MapTypesArrayEnd = nullptr; 2276 /// The array of user-defined mappers passed to the runtime library. 2277 Value *MappersArray = nullptr; 2278 /// The array of original declaration names of mapped pointers sent to the 2279 /// runtime library for debugging 2280 Value *MapNamesArray = nullptr; 2281 TargetDataRTArgsTargetDataRTArgs2282 explicit TargetDataRTArgs() {} TargetDataRTArgsTargetDataRTArgs2283 explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray, 2284 Value *SizesArray, Value *MapTypesArray, 2285 Value *MapTypesArrayEnd, Value *MappersArray, 2286 Value *MapNamesArray) 2287 : BasePointersArray(BasePointersArray), PointersArray(PointersArray), 2288 SizesArray(SizesArray), MapTypesArray(MapTypesArray), 2289 MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray), 2290 MapNamesArray(MapNamesArray) {} 2291 }; 2292 2293 /// Container to pass the default attributes with which a kernel must be 2294 /// launched, used to set kernel attributes and populate associated static 2295 /// structures. 2296 /// 2297 /// For max values, < 0 means unset, == 0 means set but unknown at compile 2298 /// time. The number of max values will be 1 except for the case where 2299 /// ompx_bare is set. 
2300 struct TargetKernelDefaultAttrs { 2301 omp::OMPTgtExecModeFlags ExecFlags = 2302 omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 2303 SmallVector<int32_t, 3> MaxTeams = {-1}; 2304 int32_t MinTeams = 1; 2305 SmallVector<int32_t, 3> MaxThreads = {-1}; 2306 int32_t MinThreads = 1; 2307 int32_t ReductionDataSize = 0; 2308 int32_t ReductionBufferLength = 0; 2309 }; 2310 2311 /// Container to pass LLVM IR runtime values or constants related to the 2312 /// number of teams and threads with which the kernel must be launched, as 2313 /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD 2314 /// kernel. These must be defined in the host prior to the call to the kernel 2315 /// launch OpenMP RTL function. 2316 struct TargetKernelRuntimeAttrs { 2317 SmallVector<Value *, 3> MaxTeams = {nullptr}; 2318 Value *MinTeams = nullptr; 2319 SmallVector<Value *, 3> TargetThreadLimit = {nullptr}; 2320 SmallVector<Value *, 3> TeamsThreadLimit = {nullptr}; 2321 2322 /// 'parallel' construct 'num_threads' clause value, if present and it is an 2323 /// SPMD kernel. 2324 Value *MaxThreads = nullptr; 2325 2326 /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if 2327 /// it is a generic kernel. 2328 Value *LoopTripCount = nullptr; 2329 }; 2330 2331 /// Data structure that contains the needed information to construct the 2332 /// kernel args vector. 2333 struct TargetKernelArgs { 2334 /// Number of arguments passed to the runtime library. 2335 unsigned NumTargetItems = 0; 2336 /// Arguments passed to the runtime library 2337 TargetDataRTArgs RTArgs; 2338 /// The number of iterations 2339 Value *NumIterations = nullptr; 2340 /// The number of teams. 2341 ArrayRef<Value *> NumTeams; 2342 /// The number of threads. 2343 ArrayRef<Value *> NumThreads; 2344 /// The size of the dynamic shared memory. 2345 Value *DynCGGroupMem = nullptr; 2346 /// True if the kernel has 'no wait' clause. 
2347 bool HasNoWait = false; 2348 2349 // Constructors for TargetKernelArgs. TargetKernelArgsTargetKernelArgs2350 TargetKernelArgs() {} TargetKernelArgsTargetKernelArgs2351 TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, 2352 Value *NumIterations, ArrayRef<Value *> NumTeams, 2353 ArrayRef<Value *> NumThreads, Value *DynCGGroupMem, 2354 bool HasNoWait) 2355 : NumTargetItems(NumTargetItems), RTArgs(RTArgs), 2356 NumIterations(NumIterations), NumTeams(NumTeams), 2357 NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem), 2358 HasNoWait(HasNoWait) {} 2359 }; 2360 2361 /// Create the kernel args vector used by emitTargetKernel. This function 2362 /// creates various constant values that are used in the resulting args 2363 /// vector. 2364 LLVM_ABI static void getKernelArgsVector(TargetKernelArgs &KernelArgs, 2365 IRBuilderBase &Builder, 2366 SmallVector<Value *> &ArgsVector); 2367 2368 /// Struct that keeps the information that should be kept throughout 2369 /// a 'target data' region. 2370 class TargetDataInfo { 2371 /// Set to true if device pointer information have to be obtained. 2372 bool RequiresDevicePointerInfo = false; 2373 /// Set to true if Clang emits separate runtime calls for the beginning and 2374 /// end of the region. These calls might have separate map type arrays. 2375 bool SeparateBeginEndCalls = false; 2376 2377 public: 2378 TargetDataRTArgs RTArgs; 2379 2380 SmallMapVector<const Value *, std::pair<Value *, Value *>, 4> 2381 DevicePtrInfoMap; 2382 2383 /// Indicate whether any user-defined mapper exists. 2384 bool HasMapper = false; 2385 /// The total number of pointers passed to the runtime library. 2386 unsigned NumberOfPtrs = 0u; 2387 2388 bool EmitDebug = false; 2389 2390 /// Whether the `target ... data` directive has a `nowait` clause. 
2391 bool HasNoWait = false; 2392 TargetDataInfo()2393 explicit TargetDataInfo() {} TargetDataInfo(bool RequiresDevicePointerInfo,bool SeparateBeginEndCalls)2394 explicit TargetDataInfo(bool RequiresDevicePointerInfo, 2395 bool SeparateBeginEndCalls) 2396 : RequiresDevicePointerInfo(RequiresDevicePointerInfo), 2397 SeparateBeginEndCalls(SeparateBeginEndCalls) {} 2398 /// Clear information about the data arrays. clearArrayInfo()2399 void clearArrayInfo() { 2400 RTArgs = TargetDataRTArgs(); 2401 HasMapper = false; 2402 NumberOfPtrs = 0u; 2403 } 2404 /// Return true if the current target data information has valid arrays. isValid()2405 bool isValid() { 2406 return RTArgs.BasePointersArray && RTArgs.PointersArray && 2407 RTArgs.SizesArray && RTArgs.MapTypesArray && 2408 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs; 2409 } requiresDevicePointerInfo()2410 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; } separateBeginEndCalls()2411 bool separateBeginEndCalls() { return SeparateBeginEndCalls; } 2412 }; 2413 2414 enum class DeviceInfoTy { None, Pointer, Address }; 2415 using MapValuesArrayTy = SmallVector<Value *, 4>; 2416 using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>; 2417 using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>; 2418 using MapNamesArrayTy = SmallVector<Constant *, 4>; 2419 using MapDimArrayTy = SmallVector<uint64_t, 4>; 2420 using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>; 2421 2422 /// This structure contains combined information generated for mappable 2423 /// clauses, including base pointers, pointers, sizes, map types, user-defined 2424 /// mappers, and non-contiguous information. 
2425 struct MapInfosTy { 2426 struct StructNonContiguousInfo { 2427 bool IsNonContiguous = false; 2428 MapDimArrayTy Dims; 2429 MapNonContiguousArrayTy Offsets; 2430 MapNonContiguousArrayTy Counts; 2431 MapNonContiguousArrayTy Strides; 2432 }; 2433 MapValuesArrayTy BasePointers; 2434 MapValuesArrayTy Pointers; 2435 MapDeviceInfoArrayTy DevicePointers; 2436 MapValuesArrayTy Sizes; 2437 MapFlagsArrayTy Types; 2438 MapNamesArrayTy Names; 2439 StructNonContiguousInfo NonContigInfo; 2440 2441 /// Append arrays in \a CurInfo. appendMapInfosTy2442 void append(MapInfosTy &CurInfo) { 2443 BasePointers.append(CurInfo.BasePointers.begin(), 2444 CurInfo.BasePointers.end()); 2445 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end()); 2446 DevicePointers.append(CurInfo.DevicePointers.begin(), 2447 CurInfo.DevicePointers.end()); 2448 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end()); 2449 Types.append(CurInfo.Types.begin(), CurInfo.Types.end()); 2450 Names.append(CurInfo.Names.begin(), CurInfo.Names.end()); 2451 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(), 2452 CurInfo.NonContigInfo.Dims.end()); 2453 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(), 2454 CurInfo.NonContigInfo.Offsets.end()); 2455 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(), 2456 CurInfo.NonContigInfo.Counts.end()); 2457 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(), 2458 CurInfo.NonContigInfo.Strides.end()); 2459 } 2460 }; 2461 using MapInfosOrErrorTy = Expected<MapInfosTy &>; 2462 2463 /// Callback function type for functions emitting the host fallback code that 2464 /// is executed when the kernel launch fails. It takes an insertion point as 2465 /// parameter where the code should be emitted. It returns an insertion point 2466 /// that points right after after the emitted code. 
2467 using EmitFallbackCallbackTy = 2468 function_ref<InsertPointOrErrorTy(InsertPointTy)>; 2469 2470 // Callback function type for emitting and fetching user defined custom 2471 // mappers. 2472 using CustomMapperCallbackTy = 2473 function_ref<Expected<Function *>(unsigned int)>; 2474 2475 /// Generate a target region entry call and host fallback call. 2476 /// 2477 /// \param Loc The location at which the request originated and is fulfilled. 2478 /// \param OutlinedFnID The ooulined function ID. 2479 /// \param EmitTargetCallFallbackCB Call back function to generate host 2480 /// fallback code. 2481 /// \param Args Data structure holding information about the kernel arguments. 2482 /// \param DeviceID Identifier for the device via the 'device' clause. 2483 /// \param RTLoc Source location identifier 2484 /// \param AllocaIP The insertion point to be used for alloca instructions. 2485 LLVM_ABI InsertPointOrErrorTy emitKernelLaunch( 2486 const LocationDescription &Loc, Value *OutlinedFnID, 2487 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, 2488 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP); 2489 2490 /// Callback type for generating the bodies of device directives that require 2491 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses). 2492 /// 2493 /// \param DeviceID The ID of the device on which the target region will 2494 /// execute. 2495 /// \param RTLoc Source location identifier 2496 /// \Param TargetTaskAllocaIP Insertion point for the alloca block of the 2497 /// generated task. 2498 /// 2499 /// \return an error, if any were triggered during execution. 2500 using TargetTaskBodyCallbackTy = 2501 function_ref<Error(Value *DeviceID, Value *RTLoc, 2502 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>; 2503 2504 /// Generate a target-task for the target construct 2505 /// 2506 /// \param TaskBodyCB Callback to generate the actual body of the target task. 
2507 /// \param DeviceID Identifier for the device via the 'device' clause. 2508 /// \param RTLoc Source location identifier 2509 /// \param AllocaIP The insertion point to be used for alloca instructions. 2510 /// \param Dependencies Vector of DependData objects holding information of 2511 /// dependencies as specified by the 'depend' clause. 2512 /// \param HasNoWait True if the target construct had 'nowait' on it, false 2513 /// otherwise 2514 LLVM_ABI InsertPointOrErrorTy emitTargetTask( 2515 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, 2516 OpenMPIRBuilder::InsertPointTy AllocaIP, 2517 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies, 2518 const TargetDataRTArgs &RTArgs, bool HasNoWait); 2519 2520 /// Emit the arguments to be passed to the runtime library based on the 2521 /// arrays of base pointers, pointers, sizes, map types, and mappers. If 2522 /// ForEndCall, emit map types to be passed for the end of the region instead 2523 /// of the beginning. 2524 LLVM_ABI void emitOffloadingArraysArgument( 2525 IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, 2526 OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall = false); 2527 2528 /// Emit an array of struct descriptors to be assigned to the offload args. 2529 LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, 2530 InsertPointTy CodeGenIP, 2531 MapInfosTy &CombinedInfo, 2532 TargetDataInfo &Info); 2533 2534 /// Emit the arrays used to pass the captures and map information to the 2535 /// offloading runtime library. If there is no map or capture information, 2536 /// return nullptr by reference. Accepts a reference to a MapInfosTy object 2537 /// that contains information generated for mappable clauses, 2538 /// including base pointers, pointers, sizes, map types, user-defined mappers. 
2539 LLVM_ABI Error emitOffloadingArrays( 2540 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, 2541 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, 2542 bool IsNonContiguous = false, 2543 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr); 2544 2545 /// Allocates memory for and populates the arrays required for offloading 2546 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it 2547 /// emits their base addresses as arguments to be passed to the runtime 2548 /// library. In essence, this function is a combination of 2549 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably 2550 /// be preferred by clients of OpenMPIRBuilder. 2551 LLVM_ABI Error emitOffloadingArraysAndArgs( 2552 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, 2553 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, 2554 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false, 2555 bool ForEndCall = false, 2556 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr); 2557 2558 /// Creates offloading entry for the provided entry ID \a ID, address \a 2559 /// Addr, size \a Size, and flags \a Flags. 2560 LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, 2561 int32_t Flags, GlobalValue::LinkageTypes, 2562 StringRef Name = ""); 2563 2564 /// The kind of errors that can occur when emitting the offload entries and 2565 /// metadata. 2566 enum EmitMetadataErrorKind { 2567 EMIT_MD_TARGET_REGION_ERROR, 2568 EMIT_MD_DECLARE_TARGET_ERROR, 2569 EMIT_MD_GLOBAL_VAR_LINK_ERROR 2570 }; 2571 2572 /// Callback function type 2573 using EmitMetadataErrorReportFunctionTy = 2574 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>; 2575 2576 // Emit the offloading entries and metadata so that the device codegen side 2577 // can easily figure out what to emit. 
The produced metadata looks like 2578 // this: 2579 // 2580 // !omp_offload.info = !{!1, ...} 2581 // 2582 // We only generate metadata for function that contain target regions. 2583 LLVM_ABI void createOffloadEntriesAndInfoMetadata( 2584 EmitMetadataErrorReportFunctionTy &ErrorReportFunction); 2585 2586 public: 2587 /// Generator for __kmpc_copyprivate 2588 /// 2589 /// \param Loc The source location description. 2590 /// \param BufSize Number of elements in the buffer. 2591 /// \param CpyBuf List of pointers to data to be copied. 2592 /// \param CpyFn function to call for copying data. 2593 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise. 2594 /// 2595 /// \return The insertion position *after* the CopyPrivate call. 2596 2597 LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, 2598 llvm::Value *BufSize, 2599 llvm::Value *CpyBuf, 2600 llvm::Value *CpyFn, 2601 llvm::Value *DidIt); 2602 2603 /// Generator for '#omp single' 2604 /// 2605 /// \param Loc The source location description. 2606 /// \param BodyGenCB Callback that will generate the region code. 2607 /// \param FiniCB Callback to finalize variable copies. 2608 /// \param IsNowait If false, a barrier is emitted. 2609 /// \param CPVars copyprivate variables. 2610 /// \param CPFuncs copy functions to use for each copyprivate variable. 2611 /// 2612 /// \returns The insertion position *after* the single call. 2613 LLVM_ABI InsertPointOrErrorTy 2614 createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 2615 FinalizeCallbackTy FiniCB, bool IsNowait, 2616 ArrayRef<llvm::Value *> CPVars = {}, 2617 ArrayRef<llvm::Function *> CPFuncs = {}); 2618 2619 /// Generator for '#omp master' 2620 /// 2621 /// \param Loc The insert and source location description. 2622 /// \param BodyGenCB Callback that will generate the region code. 2623 /// \param FiniCB Callback to finalize variable copies. 2624 /// 2625 /// \returns The insertion position *after* the master. 
2626 LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, 2627 BodyGenCallbackTy BodyGenCB, 2628 FinalizeCallbackTy FiniCB); 2629 2630 /// Generator for '#omp masked' 2631 /// 2632 /// \param Loc The insert and source location description. 2633 /// \param BodyGenCB Callback that will generate the region code. 2634 /// \param FiniCB Callback to finialize variable copies. 2635 /// 2636 /// \returns The insertion position *after* the masked. 2637 LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, 2638 BodyGenCallbackTy BodyGenCB, 2639 FinalizeCallbackTy FiniCB, 2640 Value *Filter); 2641 2642 /// Generator for '#omp critical' 2643 /// 2644 /// \param Loc The insert and source location description. 2645 /// \param BodyGenCB Callback that will generate the region body code. 2646 /// \param FiniCB Callback to finalize variable copies. 2647 /// \param CriticalName name of the lock used by the critical directive 2648 /// \param HintInst Hint Instruction for hint clause associated with critical 2649 /// 2650 /// \returns The insertion position *after* the critical. 2651 LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, 2652 BodyGenCallbackTy BodyGenCB, 2653 FinalizeCallbackTy FiniCB, 2654 StringRef CriticalName, 2655 Value *HintInst); 2656 2657 /// Generator for '#omp ordered depend (source | sink)' 2658 /// 2659 /// \param Loc The insert and source location description. 2660 /// \param AllocaIP The insertion point to be used for alloca instructions. 2661 /// \param NumLoops The number of loops in depend clause. 2662 /// \param StoreValues The value will be stored in vector address. 2663 /// \param Name The name of alloca instruction. 2664 /// \param IsDependSource If true, depend source; otherwise, depend sink. 2665 /// 2666 /// \return The insertion position *after* the ordered. 
2667 LLVM_ABI InsertPointTy 2668 createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, 2669 unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues, 2670 const Twine &Name, bool IsDependSource); 2671 2672 /// Generator for '#omp ordered [threads | simd]' 2673 /// 2674 /// \param Loc The insert and source location description. 2675 /// \param BodyGenCB Callback that will generate the region code. 2676 /// \param FiniCB Callback to finalize variable copies. 2677 /// \param IsThreads If true, with threads clause or without clause; 2678 /// otherwise, with simd clause; 2679 /// 2680 /// \returns The insertion position *after* the ordered. 2681 LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd( 2682 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 2683 FinalizeCallbackTy FiniCB, bool IsThreads); 2684 2685 /// Generator for '#omp sections' 2686 /// 2687 /// \param Loc The insert and source location description. 2688 /// \param AllocaIP The insertion points to be used for alloca instructions. 2689 /// \param SectionCBs Callbacks that will generate body of each section. 2690 /// \param PrivCB Callback to copy a given variable (think copy constructor). 2691 /// \param FiniCB Callback to finalize variable copies. 2692 /// \param IsCancellable Flag to indicate a cancellable parallel region. 2693 /// \param IsNowait If true, barrier - to ensure all sections are executed 2694 /// before moving forward will not be generated. 2695 /// \returns The insertion position *after* the sections. 2696 LLVM_ABI InsertPointOrErrorTy 2697 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, 2698 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, 2699 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, 2700 bool IsCancellable, bool IsNowait); 2701 2702 /// Generator for '#omp section' 2703 /// 2704 /// \param Loc The insert and source location description. 2705 /// \param BodyGenCB Callback that will generate the region body code. 
2706 /// \param FiniCB Callback to finalize variable copies. 2707 /// \returns The insertion position *after* the section. 2708 LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, 2709 BodyGenCallbackTy BodyGenCB, 2710 FinalizeCallbackTy FiniCB); 2711 2712 /// Generator for `#omp teams` 2713 /// 2714 /// \param Loc The location where the teams construct was encountered. 2715 /// \param BodyGenCB Callback that will generate the region code. 2716 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr, 2717 /// it is as if lower bound is specified as equal to upperbound. If 2718 /// this is non-null, then upperbound must also be non-null. 2719 /// \param NumTeamsUpper Upper bound on the number of teams. 2720 /// \param ThreadLimit on the number of threads that may participate in a 2721 /// contention group created by each team. 2722 /// \param IfExpr is the integer argument value of the if condition on the 2723 /// teams clause. 2724 LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, 2725 BodyGenCallbackTy BodyGenCB, 2726 Value *NumTeamsLower = nullptr, 2727 Value *NumTeamsUpper = nullptr, 2728 Value *ThreadLimit = nullptr, 2729 Value *IfExpr = nullptr); 2730 2731 /// Generator for `#omp distribute` 2732 /// 2733 /// \param Loc The location where the distribute construct was encountered. 2734 /// \param AllocaIP The insertion points to be used for alloca instructions. 2735 /// \param BodyGenCB Callback that will generate the region code. 2736 LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, 2737 InsertPointTy AllocaIP, 2738 BodyGenCallbackTy BodyGenCB); 2739 2740 /// Generate conditional branch and relevant BasicBlocks through which private 2741 /// threads copy the 'copyin' variables from Master copy to threadprivate 2742 /// copies. 
2743 /// 2744 /// \param IP insertion block for copyin conditional 2745 /// \param MasterVarPtr a pointer to the master variable 2746 /// \param PrivateVarPtr a pointer to the threadprivate variable 2747 /// \param IntPtrTy Pointer size type 2748 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks 2749 // and copy.in.end block 2750 /// 2751 /// \returns The insertion point where copying operation to be emitted. 2752 LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, 2753 Value *MasterAddr, 2754 Value *PrivateAddr, 2755 llvm::IntegerType *IntPtrTy, 2756 bool BranchtoEnd = true); 2757 2758 /// Create a runtime call for kmpc_Alloc 2759 /// 2760 /// \param Loc The insert and source location description. 2761 /// \param Size Size of allocated memory space 2762 /// \param Allocator Allocator information instruction 2763 /// \param Name Name of call Instruction for OMP_alloc 2764 /// 2765 /// \returns CallInst to the OMP_Alloc call 2766 LLVM_ABI CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size, 2767 Value *Allocator, std::string Name = ""); 2768 2769 /// Create a runtime call for kmpc_free 2770 /// 2771 /// \param Loc The insert and source location description. 2772 /// \param Addr Address of memory space to be freed 2773 /// \param Allocator Allocator information instruction 2774 /// \param Name Name of call Instruction for OMP_Free 2775 /// 2776 /// \returns CallInst to the OMP_Free call 2777 LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr, 2778 Value *Allocator, std::string Name = ""); 2779 2780 /// Create a runtime call for kmpc_threadprivate_cached 2781 /// 2782 /// \param Loc The insert and source location description. 2783 /// \param Pointer pointer to data to be cached 2784 /// \param Size size of data to be cached 2785 /// \param Name Name of call Instruction for callinst 2786 /// 2787 /// \returns CallInst to the thread private cache call. 
LLVM_ABI CallInst *
createCachedThreadPrivate(const LocationDescription &Loc,
                          llvm::Value *Pointer, llvm::ConstantInt *Size,
                          const llvm::Twine &Name = Twine(""));

/// Create a runtime call for __tgt_interop_init
///
/// \param Loc The insert and source location description.
/// \param InteropVar variable to be allocated
/// \param InteropType type of interop operation
/// \param Device device to which offloading will occur
/// \param NumDependences number of dependence variables
/// \param DependenceAddress pointer to dependence variables
/// \param HaveNowaitClause does nowait clause exist
///
/// \returns CallInst to the __tgt_interop_init call
LLVM_ABI CallInst *createOMPInteropInit(const LocationDescription &Loc,
                                        Value *InteropVar,
                                        omp::OMPInteropType InteropType,
                                        Value *Device, Value *NumDependences,
                                        Value *DependenceAddress,
                                        bool HaveNowaitClause);

/// Create a runtime call for __tgt_interop_destroy
///
/// \param Loc The insert and source location description.
/// \param InteropVar variable to be allocated
/// \param Device device to which offloading will occur
/// \param NumDependences number of dependence variables
/// \param DependenceAddress pointer to dependence variables
/// \param HaveNowaitClause does nowait clause exist
///
/// \returns CallInst to the __tgt_interop_destroy call
LLVM_ABI CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
                                           Value *InteropVar, Value *Device,
                                           Value *NumDependences,
                                           Value *DependenceAddress,
                                           bool HaveNowaitClause);

/// Create a runtime call for __tgt_interop_use
///
/// \param Loc The insert and source location description.
/// \param InteropVar variable to be allocated
/// \param Device device to which offloading will occur
/// \param NumDependences number of dependence variables
/// \param DependenceAddress pointer to dependence variables
/// \param HaveNowaitClause does nowait clause exist
///
/// \returns CallInst to the __tgt_interop_use call
LLVM_ABI CallInst *createOMPInteropUse(const LocationDescription &Loc,
                                       Value *InteropVar, Value *Device,
                                       Value *NumDependences,
                                       Value *DependenceAddress,
                                       bool HaveNowaitClause);

/// The `omp target` interface
///
/// For more information about the usage of this interface,
/// \see openmp/libomptarget/deviceRTLs/common/include/target.h
///
///{

/// Create a runtime call for kmpc_target_init
///
/// \param Loc The insert and source location description.
/// \param Attrs Structure containing the default attributes, including
/// numbers of threads and teams to launch the kernel with.
LLVM_ABI InsertPointTy createTargetInit(
    const LocationDescription &Loc,
    const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs);

/// Create a runtime call for kmpc_target_deinit
///
/// \param Loc The insert and source location description.
/// \param TeamsReductionDataSize The maximal size of all the reduction data
/// for teams reduction.
/// \param TeamsReductionBufferLength The number of elements (each of up to
/// \p TeamsReductionDataSize size), in the teams reduction buffer.
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc,
                                 int32_t TeamsReductionDataSize = 0,
                                 int32_t TeamsReductionBufferLength = 1024);

///}

/// Helpers to read/write kernel annotations from the IR.
///
///{

/// Read/write bounds on threads for \p Kernel. Read will return 0 if none
/// is set.
2878 LLVM_ABI static std::pair<int32_t, int32_t> 2879 readThreadBoundsForKernel(const Triple &T, Function &Kernel); 2880 LLVM_ABI static void writeThreadBoundsForKernel(const Triple &T, 2881 Function &Kernel, int32_t LB, 2882 int32_t UB); 2883 2884 /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none 2885 /// is set. 2886 LLVM_ABI static std::pair<int32_t, int32_t> 2887 readTeamBoundsForKernel(const Triple &T, Function &Kernel); 2888 LLVM_ABI static void writeTeamsForKernel(const Triple &T, Function &Kernel, 2889 int32_t LB, int32_t UB); 2890 ///} 2891 2892 private: 2893 // Sets the function attributes expected for the outlined function 2894 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn); 2895 2896 // Creates the function ID/Address for the given outlined function. 2897 // In the case of an embedded device function the address of the function is 2898 // used, in the case of a non-offload function a constant is created. 2899 Constant *createOutlinedFunctionID(Function *OutlinedFn, 2900 StringRef EntryFnIDName); 2901 2902 // Creates the region entry address for the outlined function 2903 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction, 2904 StringRef EntryFnName); 2905 2906 public: 2907 /// Functions used to generate a function with the given name. 2908 using FunctionGenCallback = 2909 std::function<Expected<Function *>(StringRef FunctionName)>; 2910 2911 /// Create a unique name for the entry function using the source location 2912 /// information of the current target region. The name will be something like: 2913 /// 2914 /// __omp_offloading_DD_FFFF_PP_lBB[_CC] 2915 /// 2916 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the 2917 /// mangled name of the function that encloses the target region and BB is the 2918 /// line number of the target region. CC is a count added when more than one 2919 /// region is located at the same location. 
///
/// If this target outline function is not an offload entry, we don't need to
/// register it. This may happen if it is guarded by an if clause that is
/// false at compile time, or no target archs have been specified.
///
/// The created target region ID is used by the runtime library to identify
/// the current target region, so it only has to be unique and not
/// necessarily point to anything. It could be the pointer to the outlined
/// function that implements the target region, but we aren't using that so
/// that the compiler doesn't need to keep that, and could therefore inline
/// the host function if proven worthwhile during optimization. On the other
/// hand, if emitting code for the device, the ID has to be the function
/// address so that it can be retrieved from the offloading entry and launched
/// by the runtime library. We also mark the outlined function to have
/// external linkage in case we are emitting code for the device, because
/// these functions will be entry points to the device.
///
/// \param InfoManager The info manager keeping track of the offload entries
/// \param EntryInfo The entry information about the function
/// \param GenerateFunctionCallback The callback function to generate the code
/// \param OutlinedFunction Pointer to the outlined function
/// \param EntryFnIDName Name of the ID to be created
LLVM_ABI Error emitTargetRegionFunction(
    TargetRegionEntryInfo &EntryInfo,
    FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
    Function *&OutlinedFn, Constant *&OutlinedFnID);

/// Registers the given function and sets up the attributes of the function
/// Returns the FunctionID.
///
/// \param InfoManager The info manager keeping track of the offload entries
/// \param EntryInfo The entry information about the function
/// \param OutlinedFunction Pointer to the outlined function
/// \param EntryFnName Name of the outlined function
/// \param EntryFnIDName Name of the ID to be created
LLVM_ABI Constant *
registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
                             Function *OutlinedFunction,
                             StringRef EntryFnName, StringRef EntryFnIDName);

/// Type of BodyGen to use for region codegen
///
/// Priv: If device pointer privatization is required, emit the body of the
/// region here. It will have to be duplicated: with and without
/// privatization.
/// DupNoPriv: If we need device pointer privatization, we need
/// to emit the body of the region with no privatization in the 'else' branch
/// of the conditional.
/// NoPriv: If we don't require privatization of device
/// pointers, we emit the body in between the runtime calls. This avoids
/// duplicating the body code.
enum BodyGenTy { Priv, DupNoPriv, NoPriv };

/// Callback type for creating the map infos for the kernel parameters.
/// \param CodeGenIP is the insertion point where code should be generated,
/// if any.
using GenMapInfoCallbackTy =
    function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;

private:
/// Emit the array initialization or deletion portion for user-defined mapper
/// code generation. First, it evaluates whether an array section is mapped
/// and whether the \a MapType instructs to delete this section. If \a IsInit
/// is true, and \a MapType indicates to not delete this array, array
/// initialization code is generated. If \a IsInit is false, and \a MapType
/// indicates to delete this array, array deletion code is generated.
2986 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle, 2987 llvm::Value *Base, llvm::Value *Begin, 2988 llvm::Value *Size, llvm::Value *MapType, 2989 llvm::Value *MapName, TypeSize ElementSize, 2990 llvm::BasicBlock *ExitBB, bool IsInit); 2991 2992 public: 2993 /// Emit the user-defined mapper function. The code generation follows the 2994 /// pattern in the example below. 2995 /// \code 2996 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle, 2997 /// void *base, void *begin, 2998 /// int64_t size, int64_t type, 2999 /// void *name = nullptr) { 3000 /// // Allocate space for an array section first or add a base/begin for 3001 /// // pointer dereference. 3002 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) && 3003 /// !maptype.IsDelete) 3004 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin, 3005 /// size*sizeof(Ty), clearToFromMember(type)); 3006 /// // Map members. 3007 /// for (unsigned i = 0; i < size; i++) { 3008 /// // For each component specified by this mapper: 3009 /// for (auto c : begin[i]->all_components) { 3010 /// if (c.hasMapper()) 3011 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin, 3012 /// c.arg_size, 3013 /// c.arg_type, c.arg_name); 3014 /// else 3015 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base, 3016 /// c.arg_begin, c.arg_size, c.arg_type, 3017 /// c.arg_name); 3018 /// } 3019 /// } 3020 /// // Delete the array section. 3021 /// if (size > 1 && maptype.IsDelete) 3022 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin, 3023 /// size*sizeof(Ty), clearToFromMember(type)); 3024 /// } 3025 /// \endcode 3026 /// 3027 /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the 3028 /// MapInfos and returns. 3029 /// \param ElemTy DeclareMapper element type. 3030 /// \param FuncName Optional param to specify mapper function name. 
/// \param CustomMapperCB Optional callback to generate code related to
/// custom mappers.
LLVM_ABI Expected<Function *> emitUserDefinedMapper(
    function_ref<MapInfosOrErrorTy(
        InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)>
        PrivAndGenMapInfoCB,
    llvm::Type *ElemTy, StringRef FuncName,
    CustomMapperCallbackTy CustomMapperCB);

/// Generator for '#omp target data'
///
/// \param Loc The location where the target data construct was encountered.
/// \param AllocaIP The insertion points to be used for alloca instructions.
/// \param CodeGenIP The insertion point at which the target directive code
/// should be placed.
/// \param IsBegin If true then emits begin mapper call otherwise emits
/// end mapper call.
/// \param DeviceID Stores the DeviceID from the device clause.
/// \param IfCond Value which corresponds to the if clause condition.
/// \param Info Stores all information related to the Target Data directive.
/// \param GenMapInfoCB Callback that populates the MapInfos and returns.
/// \param CustomMapperCB Callback to generate code related to
/// custom mappers.
/// \param BodyGenCB Optional Callback to generate the region code.
/// \param DeviceAddrCB Optional callback to generate code related to
/// use_device_ptr and use_device_addr.
LLVM_ABI InsertPointOrErrorTy createTargetData(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
    TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
    CustomMapperCallbackTy CustomMapperCB,
    omp::RuntimeFunction *MapperFunc = nullptr,
    function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
                                      BodyGenTy BodyGenType)>
        BodyGenCB = nullptr,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
    Value *SrcLocInfo = nullptr);

using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy(
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
    Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP)>;

/// Generator for '#omp target'
///
/// \param Loc where the target data construct was encountered.
/// \param IsOffloadEntry whether it is an offload entry.
/// \param CodeGenIP The insertion point where the call to the outlined
/// function should be emitted.
/// \param Info Stores all information related to the Target directive.
/// \param EntryInfo The entry information about the function.
/// \param DefaultAttrs Structure containing the default attributes, including
/// numbers of threads and teams to launch the kernel with.
/// \param RuntimeAttrs Structure containing the runtime numbers of threads
/// and teams to launch the kernel with.
/// \param IfCond value of the `if` clause.
/// \param Inputs The input values to the region that will be passed
/// as arguments to the outlined function.
/// \param BodyGenCB Callback that will generate the region code.
/// \param ArgAccessorFuncCB Callback that will generate accessor
/// instructions for passed in target arguments where necessary
/// \param CustomMapperCB Callback to generate code related to
/// custom mappers.
/// \param Dependencies A vector of DependData objects that carry
/// dependency information as passed in the depend clause
/// \param HasNowait Whether the target construct has a `nowait` clause or
/// not.
LLVM_ABI InsertPointOrErrorTy createTarget(
    const LocationDescription &Loc, bool IsOffloadEntry,
    OpenMPIRBuilder::InsertPointTy AllocaIP,
    OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info,
    TargetRegionEntryInfo &EntryInfo,
    const TargetKernelDefaultAttrs &DefaultAttrs,
    const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
    SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
    TargetBodyGenCallbackTy BodyGenCB,
    TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
    CustomMapperCallbackTy CustomMapperCB,
    const SmallVector<DependData> &Dependencies, bool HasNowait = false);

/// Returns __kmpc_for_static_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned. Will create a distribute call
/// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize,
                                                    bool IVSigned,
                                                    bool IsGPUDistribute);

/// Returns __kmpc_dispatch_init_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize,
                                                   bool IVSigned);

/// Returns __kmpc_dispatch_next_* runtime function for the specified
/// size \a IVSize and sign \a IVSigned.
3127 LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, 3128 bool IVSigned); 3129 3130 /// Returns __kmpc_dispatch_fini_* runtime function for the specified 3131 /// size \a IVSize and sign \a IVSigned. 3132 LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, 3133 bool IVSigned); 3134 3135 /// Returns __kmpc_dispatch_deinit runtime function. 3136 LLVM_ABI FunctionCallee createDispatchDeinitFunction(); 3137 3138 /// Declarations for LLVM-IR types (simple, array, function and structure) are 3139 /// generated below. Their names are defined and used in OpenMPKinds.def. Here 3140 /// we provide the declarations, the initializeTypes function will provide the 3141 /// values. 3142 /// 3143 ///{ 3144 #define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr; 3145 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \ 3146 ArrayType *VarName##Ty = nullptr; \ 3147 PointerType *VarName##PtrTy = nullptr; 3148 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \ 3149 FunctionType *VarName = nullptr; \ 3150 PointerType *VarName##Ptr = nullptr; 3151 #define OMP_STRUCT_TYPE(VarName, StrName, ...) \ 3152 StructType *VarName = nullptr; \ 3153 PointerType *VarName##Ptr = nullptr; 3154 #include "llvm/Frontend/OpenMP/OMPKinds.def" 3155 3156 ///} 3157 3158 private: 3159 /// Create all simple and struct types exposed by the runtime and remember 3160 /// the llvm::PointerTypes of them for easy access later. 3161 void initializeTypes(Module &M); 3162 3163 /// Common interface for generating entry calls for OMP Directives. 3164 /// if the directive has a region/body, It will set the insertion 3165 /// point to the body 3166 /// 3167 /// \param OMPD Directive to generate entry blocks for 3168 /// \param EntryCall Call to the entry OMP Runtime Function 3169 /// \param ExitBB block where the region ends. 
/// \param Conditional indicate if the entry call result will be used
///        to evaluate a conditional of whether a thread will execute
///        body code or not.
///
/// \return The insertion position in exit block
InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
                                       BasicBlock *ExitBB,
                                       bool Conditional = false);

/// Common interface to finalize the region
///
/// \param OMPD Directive to generate exiting code for
/// \param FinIP Insertion point for emitting Finalization code and exit call
/// \param ExitCall Call to the ending OMP Runtime Function
/// \param HasFinalize indicate if the directive will require finalization
///        and has a finalization callback in the stack that
///        should be called.
///
/// \return The insertion position in exit block
InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
                                             InsertPointTy FinIP,
                                             Instruction *ExitCall,
                                             bool HasFinalize = true);

/// Common Interface to generate OMP inlined regions
///
/// \param OMPD Directive to generate inlined region for
/// \param EntryCall Call to the entry OMP Runtime Function
/// \param ExitCall Call to the ending OMP Runtime Function
/// \param BodyGenCB Body code generation callback.
/// \param FiniCB Finalization Callback. Will be called when finalizing region
/// \param Conditional indicate if the entry call result will be used
///        to evaluate a conditional of whether a thread will execute
///        body code or not.
/// \param HasFinalize indicate if the directive will require finalization
///        and has a finalization callback in the stack that
///        should be called.
/// \param IsCancellable if HasFinalize is set to true, indicate if
///        the directive should be cancellable.
3209 /// \return The insertion point after the region 3210 InsertPointOrErrorTy 3211 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall, 3212 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB, 3213 FinalizeCallbackTy FiniCB, bool Conditional = false, 3214 bool HasFinalize = true, bool IsCancellable = false); 3215 3216 /// Get the platform-specific name separator. 3217 /// \param Parts different parts of the final name that needs separation 3218 /// \param FirstSeparator First separator used between the initial two 3219 /// parts of the name. 3220 /// \param Separator separator used between all of the rest consecutive 3221 /// parts of the name 3222 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts, 3223 StringRef FirstSeparator, 3224 StringRef Separator); 3225 3226 /// Returns corresponding lock object for the specified critical region 3227 /// name. If the lock object does not exist it is created, otherwise the 3228 /// reference to the existing copy is returned. 3229 /// \param CriticalName Name of the critical region. 3230 /// 3231 Value *getOMPCriticalRegionLock(StringRef CriticalName); 3232 3233 /// Callback type for Atomic Expression update 3234 /// ex: 3235 /// \code{.cpp} 3236 /// unsigned x = 0; 3237 /// #pragma omp atomic update 3238 /// x = Expr(x_old); //Expr() is any legal operation 3239 /// \endcode 3240 /// 3241 /// \param XOld the value of the atomic memory address to use for update 3242 /// \param IRB reference to the IRBuilder to use 3243 /// 3244 /// \returns Value to update X to. 3245 using AtomicUpdateCallbackTy = 3246 const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>; 3247 3248 private: 3249 enum AtomicKind { Read, Write, Update, Capture, Compare }; 3250 3251 /// Determine whether to emit flush or not 3252 /// 3253 /// \param Loc The insert and source location description. 3254 /// \param AO The required atomic ordering 3255 /// \param AK The OpenMP atomic operation kind used. 
///
/// \returns whether a flush was emitted or not
bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
                                  AtomicOrdering AO, AtomicKind AK);

/// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
/// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
/// Only Scalar data types.
///
/// \param AllocaIP The insertion point to be used for alloca
///                 instructions.
/// \param X The target atomic pointer to be updated
/// \param XElemTy The element type of the atomic pointer.
/// \param Expr The value to update X with.
/// \param AO Atomic ordering of the generated atomic
///           instructions.
/// \param RMWOp The binary operation used for update. If the
///              operation is not supported by atomicRMW,
///              or belongs to {FADD, FSUB, BAD_BINOP},
///              then a `cmpExch` based atomic will be generated.
/// \param UpdateOp Code generator for complex expressions that cannot be
///                 expressed through atomicrmw instruction.
/// \param VolatileX true if \a X is volatile
/// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
///                     update expression, false otherwise.
///                     (e.g. true for X = X BinOp Expr)
///
/// \returns A pair of the old value of X before the update, and the value
///          used for the update.
Expected<std::pair<Value *, Value *>>
emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
                 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
                 bool IsXBinopExpr);

/// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
///
/// \return The instruction
Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                              AtomicRMWInst::BinOp RMWOp);

bool IsFinalized;

public:
/// a struct to pack relevant information while generating atomic Ops
struct AtomicOpValue {
  Value *Var = nullptr;
  Type *ElemTy = nullptr;
  bool IsSigned = false;
  bool IsVolatile = false;
};

/// Emit atomic Read for : V = X --- Only Scalar data types.
///
/// \param Loc The insert and source location description.
/// \param X The target pointer to be atomically read
/// \param V Memory address where to store atomically read
///          value
/// \param AO Atomic ordering of the generated atomic
///           instructions.
/// \param AllocaIP Insert point for allocas
///
/// \return Insertion point after generated atomic read IR.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc,
                                        AtomicOpValue &X, AtomicOpValue &V,
                                        AtomicOrdering AO,
                                        InsertPointTy AllocaIP);

/// Emit atomic write for : X = Expr --- Only Scalar data types.
///
/// \param Loc The insert and source location description.
/// \param X The target pointer to be atomically written to
/// \param Expr The value to store.
/// \param AO Atomic ordering of the generated atomic
///           instructions.
/// \param AllocaIP Insert point for allocas
///
/// \return Insertion point after generated atomic Write IR.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc,
                                         AtomicOpValue &X, Value *Expr,
                                         AtomicOrdering AO,
                                         InsertPointTy AllocaIP);

/// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
/// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
/// Only Scalar data types.
///
/// \param Loc The insert and source location description.
3344 /// \param AllocaIP The insertion point to be used for alloca instructions. 3345 /// \param X The target atomic pointer to be updated 3346 /// \param Expr The value to update X with. 3347 /// \param AO Atomic ordering of the generated atomic instructions. 3348 /// \param RMWOp The binary operation used for update. If operation 3349 /// is not supported by atomicRMW, or belong to 3350 /// {FADD, FSUB, BAD_BINOP}. Then a `cmpExch` based 3351 /// atomic will be generated. 3352 /// \param UpdateOp Code generator for complex expressions that cannot be 3353 /// expressed through atomicrmw instruction. 3354 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the 3355 /// update expression, false otherwise. 3356 /// (e.g. true for X = X BinOp Expr) 3357 /// 3358 /// \return Insertion point after generated atomic update IR. 3359 LLVM_ABI InsertPointOrErrorTy createAtomicUpdate( 3360 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, 3361 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, 3362 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr); 3363 3364 /// Emit atomic update for constructs: --- Only Scalar data types 3365 /// V = X; X = X BinOp Expr , 3366 /// X = X BinOp Expr; V = X, 3367 /// V = X; X = Expr BinOp X, 3368 /// X = Expr BinOp X; V = X, 3369 /// V = X; X = UpdateOp(X), 3370 /// X = UpdateOp(X); V = X, 3371 /// 3372 /// \param Loc The insert and source location description. 3373 /// \param AllocaIP The insertion point to be used for alloca instructions. 3374 /// \param X The target atomic pointer to be updated 3375 /// \param V Memory address where to store captured value 3376 /// \param Expr The value to update X with. 3377 /// \param AO Atomic ordering of the generated atomic instructions 3378 /// \param RMWOp The binary operation used for update. If 3379 /// operation is not supported by atomicRMW, or belong to 3380 /// {FADD, FSUB, BAD_BINOP}. 
Then a cmpExch based 3381 /// atomic will be generated. 3382 /// \param UpdateOp Code generator for complex expressions that cannot be 3383 /// expressed through atomicrmw instruction. 3384 /// \param UpdateExpr true if X is an in place update of the form 3385 /// X = X BinOp Expr or X = Expr BinOp X 3386 /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the 3387 /// update expression, false otherwise. 3388 /// (e.g. true for X = X BinOp Expr) 3389 /// \param IsPostfixUpdate true if original value of 'x' must be stored in 3390 /// 'v', not an updated one. 3391 /// 3392 /// \return Insertion point after generated atomic capture IR. 3393 LLVM_ABI InsertPointOrErrorTy createAtomicCapture( 3394 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, 3395 AtomicOpValue &V, Value *Expr, AtomicOrdering AO, 3396 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, 3397 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr); 3398 3399 /// Emit atomic compare for constructs: --- Only scalar data types 3400 /// cond-expr-stmt: 3401 /// x = x ordop expr ? expr : x; 3402 /// x = expr ordop x ? expr : x; 3403 /// x = x == e ? d : x; 3404 /// x = e == x ? d : x; (this one is not in the spec) 3405 /// cond-update-stmt: 3406 /// if (x ordop expr) { x = expr; } 3407 /// if (expr ordop x) { x = expr; } 3408 /// if (x == e) { x = d; } 3409 /// if (e == x) { x = d; } (this one is not in the spec) 3410 /// conditional-update-capture-atomic: 3411 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false) 3412 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false) 3413 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false, 3414 /// IsFailOnly=true) 3415 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false) 3416 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false, 3417 /// IsFailOnly=true) 3418 /// 3419 /// \param Loc The insert and source location description. 
3420 /// \param X The target atomic pointer to be updated. 3421 /// \param V Memory address where to store captured value (for 3422 /// compare capture only). 3423 /// \param R Memory address where to store comparison result 3424 /// (for compare capture with '==' only). 3425 /// \param E The expected value ('e') for forms that use an 3426 /// equality comparison or an expression ('expr') for 3427 /// forms that use 'ordop' (logically an atomic maximum or 3428 /// minimum). 3429 /// \param D The desired value for forms that use an equality 3430 /// comparison. If forms that use 'ordop', it should be 3431 /// \p nullptr. 3432 /// \param AO Atomic ordering of the generated atomic instructions. 3433 /// \param Op Atomic compare operation. It can only be ==, <, or >. 3434 /// \param IsXBinopExpr True if the conditional statement is in the form where 3435 /// x is on LHS. It only matters for < or >. 3436 /// \param IsPostfixUpdate True if original value of 'x' must be stored in 3437 /// 'v', not an updated one (for compare capture 3438 /// only). 3439 /// \param IsFailOnly True if the original value of 'x' is stored to 'v' 3440 /// only when the comparison fails. This is only valid for 3441 /// the case the comparison is '=='. 3442 /// 3443 /// \return Insertion point after generated atomic capture IR. 3444 LLVM_ABI InsertPointTy 3445 createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, 3446 AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, 3447 AtomicOrdering AO, omp::OMPAtomicCompareOp Op, 3448 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly); 3449 LLVM_ABI InsertPointTy createAtomicCompare( 3450 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, 3451 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, 3452 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, 3453 bool IsFailOnly, AtomicOrdering Failure); 3454 3455 /// Create the control flow structure of a canonical OpenMP loop. 
///
/// The emitted loop will be disconnected, i.e. no edge to the loop's
/// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
/// IRBuilder location is not preserved.
///
/// \param DL DebugLoc used for the instructions in the skeleton.
/// \param TripCount Value to be used for the trip count.
/// \param F Function in which to insert the BasicBlocks.
/// \param PreInsertBefore Where to insert BBs that execute before the body,
/// typically the body itself.
/// \param PostInsertBefore Where to insert BBs that execute after the body.
/// \param Name Base name used to derive BB
/// and instruction names.
///
/// \returns The CanonicalLoopInfo that represents the emitted loop.
LLVM_ABI CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
                                               Function *F,
                                               BasicBlock *PreInsertBefore,
                                               BasicBlock *PostInsertBefore,
                                               const Twine &Name = {});
/// OMP Offload Info Metadata name string
const std::string ompOffloadInfoName = "omp_offload.info";

/// Loads all the offload entries information from the host IR
/// metadata. This function is only meant to be used with device code
/// generation.
///
/// \param M Module to load Metadata info from. Module passed may be
/// loaded from bitcode file, i.e., different from OpenMPIRBuilder::M module.
LLVM_ABI void loadOffloadInfoMetadata(Module &M);

/// Loads all the offload entries information from the host IR
/// metadata read from the file passed in as the HostFilePath argument. This
/// function is only meant to be used with device code generation.
///
/// \param HostFilePath The path to the host IR file,
/// used to load in offload metadata for the device, allowing host and device
/// to maintain the same metadata mapping.
3494 LLVM_ABI void loadOffloadInfoMetadata(StringRef HostFilePath); 3495 3496 /// Gets (if variable with the given name already exist) or creates 3497 /// internal global variable with the specified Name. The created variable has 3498 /// linkage CommonLinkage by default and is initialized by null value. 3499 /// \param Ty Type of the global variable. If it is exist already the type 3500 /// must be the same. 3501 /// \param Name Name of the variable. 3502 LLVM_ABI GlobalVariable * 3503 getOrCreateInternalVariable(Type *Ty, const StringRef &Name, 3504 unsigned AddressSpace = 0); 3505 }; 3506 3507 /// Class to represented the control flow structure of an OpenMP canonical loop. 3508 /// 3509 /// The control-flow structure is standardized for easy consumption by 3510 /// directives associated with loops. For instance, the worksharing-loop 3511 /// construct may change this control flow such that each loop iteration is 3512 /// executed on only one thread. The constraints of a canonical loop in brief 3513 /// are: 3514 /// 3515 /// * The number of loop iterations must have been computed before entering the 3516 /// loop. 3517 /// 3518 /// * Has an (unsigned) logical induction variable that starts at zero and 3519 /// increments by one. 3520 /// 3521 /// * The loop's CFG itself has no side-effects. The OpenMP specification 3522 /// itself allows side-effects, but the order in which they happen, including 3523 /// how often or whether at all, is unspecified. We expect that the frontend 3524 /// will emit those side-effect instructions somewhere (e.g. before the loop) 3525 /// such that the CanonicalLoopInfo itself can be side-effect free. 3526 /// 3527 /// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated 3528 /// execution of a loop body that satifies these constraints. It does NOT 3529 /// represent arbitrary SESE regions that happen to contain a loop. Do not use 3530 /// CanonicalLoopInfo for such purposes. 
///
/// The control flow can be described as follows:
///
///     Preheader
///        |
///  /-> Header
///  |     |
///  |    Cond---\
///  |     |     |
///  |    Body   |
///  |    | |    |
///  |   <...>   |
///  |    | |    |
///   \--Latch   |
///              |
///             Exit
///              |
///            After
///
/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
/// including) and end at AfterIP (at the After's first instruction, excluding).
/// That is, instructions in the Preheader and After blocks (except the
/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
/// side-effects. Typically, the Preheader is used to compute the loop's trip
/// count. The instructions from BodyIP (at the Body block's first instruction,
/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
/// control and thus can have side-effects. The body block is the single entry
/// point into the loop body, which may contain arbitrary control flow as long
/// as all control paths eventually branch to the Latch block.
///
/// TODO: Consider adding another standardized BasicBlock between Body CFG and
/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier by not needing to consider multiple
/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
/// executes after each body iteration.
///
/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalent to the Latch having no PHINode and the Header's only PHINode
/// being for the induction variable.
///
/// All code in Header, Cond, Latch and Exit (plus the terminator of the
/// Preheader) is CanonicalLoopInfo's responsibility and its build-up is
/// checked by assertOK(). It is expected not to be modified unless explicitly
/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
/// anymore as its underlying control flow may not exist anymore.
/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
/// may also return a new CanonicalLoopInfo that can be passed to other
/// loop-associated construct implementing methods. These loop-transforming
/// methods may either create a new CanonicalLoopInfo, usually using
/// createLoopSkeleton, and invalidate the input CanonicalLoopInfo, or reuse
/// and modify one of the input CanonicalLoopInfos and return it as
/// representing the modified loop. What is done is an implementation detail of
/// the transformation-implementing method, and callers should always assume
/// that the CanonicalLoopInfo passed to it is invalidated and a new object is
/// returned. A returned CanonicalLoopInfo has the same structure and
/// guarantees as one created by createCanonicalLoop, such that transforming
/// methods do not have to special-case where the CanonicalLoopInfo originated
/// from.
///
/// Generally, methods consuming CanonicalLoopInfo do not need an
/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
/// CanonicalLoopInfo to insert new or modify existing instructions.
Unless 3596 /// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate 3597 /// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically, 3598 /// any InsertPoint in the Preheader, After or Block can still be used after 3599 /// calling such a method. 3600 /// 3601 /// TODO: Provide mechanisms for exception handling and cancellation points. 3602 /// 3603 /// Defined outside OpenMPIRBuilder because nested classes cannot be 3604 /// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h. 3605 class CanonicalLoopInfo { 3606 friend class OpenMPIRBuilder; 3607 3608 private: 3609 BasicBlock *Header = nullptr; 3610 BasicBlock *Cond = nullptr; 3611 BasicBlock *Latch = nullptr; 3612 BasicBlock *Exit = nullptr; 3613 3614 // Hold the MLIR value for the `lastiter` of the canonical loop. 3615 Value *LastIter = nullptr; 3616 3617 /// Add the control blocks of this loop to \p BBs. 3618 /// 3619 /// This does not include any block from the body, including the one returned 3620 /// by getBody(). 3621 /// 3622 /// FIXME: This currently includes the Preheader and After blocks even though 3623 /// their content is (mostly) not under CanonicalLoopInfo's control. 3624 /// Re-evaluated whether this makes sense. 3625 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs); 3626 3627 /// Sets the number of loop iterations to the given value. This value must be 3628 /// valid in the condition block (i.e., defined in the preheader) and is 3629 /// interpreted as an unsigned integer. 3630 void setTripCount(Value *TripCount); 3631 3632 /// Replace all uses of the canonical induction variable in the loop body with 3633 /// a new one. 3634 /// 3635 /// The intended use case is to update the induction variable for an updated 3636 /// iteration space such that it can stay normalized in the 0...tripcount-1 3637 /// range. 
3638 /// 3639 /// The \p Updater is called with the (presumable updated) current normalized 3640 /// induction variable and is expected to return the value that uses of the 3641 /// pre-updated induction values should use instead, typically dependent on 3642 /// the new induction variable. This is a lambda (instead of e.g. just passing 3643 /// the new value) to be able to distinguish the uses of the pre-updated 3644 /// induction variable and uses of the induction varible to compute the 3645 /// updated induction variable value. 3646 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater); 3647 3648 public: 3649 /// Sets the last iteration variable for this loop. setLastIter(Value * IterVar)3650 void setLastIter(Value *IterVar) { LastIter = std::move(IterVar); } 3651 3652 /// Returns the last iteration variable for this loop. 3653 /// Certain use-cases (like translation of linear clause) may access 3654 /// this variable even after a loop transformation. Hence, do not guard 3655 /// this getter function by `isValid`. It is the responsibility of the 3656 /// callee to ensure this functionality is not invoked by a non-outlined 3657 /// CanonicalLoopInfo object (in which case, `setLastIter` will never be 3658 /// invoked and `LastIter` will be by default `nullptr`). getLastIter()3659 Value *getLastIter() { return LastIter; } 3660 3661 /// Returns whether this object currently represents the IR of a loop. If 3662 /// returning false, it may have been consumed by a loop transformation or not 3663 /// been intialized. Do not use in this case; isValid()3664 bool isValid() const { return Header; } 3665 3666 /// The preheader ensures that there is only a single edge entering the loop. 3667 /// Code that must be execute before any loop iteration can be emitted here, 3668 /// such as computing the loop trip count and begin lifetime markers. Code in 3669 /// the preheader is not considered part of the canonical loop. 
3670 LLVM_ABI BasicBlock *getPreheader() const; 3671 3672 /// The header is the entry for each iteration. In the canonical control flow, 3673 /// it only contains the PHINode for the induction variable. getHeader()3674 BasicBlock *getHeader() const { 3675 assert(isValid() && "Requires a valid canonical loop"); 3676 return Header; 3677 } 3678 3679 /// The condition block computes whether there is another loop iteration. If 3680 /// yes, branches to the body; otherwise to the exit block. getCond()3681 BasicBlock *getCond() const { 3682 assert(isValid() && "Requires a valid canonical loop"); 3683 return Cond; 3684 } 3685 3686 /// The body block is the single entry for a loop iteration and not controlled 3687 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must 3688 /// eventually branch to the \p Latch block. getBody()3689 BasicBlock *getBody() const { 3690 assert(isValid() && "Requires a valid canonical loop"); 3691 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0); 3692 } 3693 3694 /// Reaching the latch indicates the end of the loop body code. In the 3695 /// canonical control flow, it only contains the increment of the induction 3696 /// variable. getLatch()3697 BasicBlock *getLatch() const { 3698 assert(isValid() && "Requires a valid canonical loop"); 3699 return Latch; 3700 } 3701 3702 /// Reaching the exit indicates no more iterations are being executed. getExit()3703 BasicBlock *getExit() const { 3704 assert(isValid() && "Requires a valid canonical loop"); 3705 return Exit; 3706 } 3707 3708 /// The after block is intended for clean-up code such as lifetime end 3709 /// markers. It is separate from the exit block to ensure, analogous to the 3710 /// preheader, it having just a single entry edge and being free from PHI 3711 /// nodes should there be multiple loop exits (such as from break 3712 /// statements/cancellations). 
getAfter()3713 BasicBlock *getAfter() const { 3714 assert(isValid() && "Requires a valid canonical loop"); 3715 return Exit->getSingleSuccessor(); 3716 } 3717 3718 /// Returns the llvm::Value containing the number of loop iterations. It must 3719 /// be valid in the preheader and always interpreted as an unsigned integer of 3720 /// any bit-width. getTripCount()3721 Value *getTripCount() const { 3722 assert(isValid() && "Requires a valid canonical loop"); 3723 Instruction *CmpI = &Cond->front(); 3724 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount"); 3725 return CmpI->getOperand(1); 3726 } 3727 3728 /// Returns the instruction representing the current logical induction 3729 /// variable. Always unsigned, always starting at 0 with an increment of one. getIndVar()3730 Instruction *getIndVar() const { 3731 assert(isValid() && "Requires a valid canonical loop"); 3732 Instruction *IndVarPHI = &Header->front(); 3733 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI"); 3734 return IndVarPHI; 3735 } 3736 3737 /// Return the type of the induction variable (and the trip count). getIndVarType()3738 Type *getIndVarType() const { 3739 assert(isValid() && "Requires a valid canonical loop"); 3740 return getIndVar()->getType(); 3741 } 3742 3743 /// Return the insertion point for user code before the loop. getPreheaderIP()3744 OpenMPIRBuilder::InsertPointTy getPreheaderIP() const { 3745 assert(isValid() && "Requires a valid canonical loop"); 3746 BasicBlock *Preheader = getPreheader(); 3747 return {Preheader, std::prev(Preheader->end())}; 3748 }; 3749 3750 /// Return the insertion point for user code in the body. getBodyIP()3751 OpenMPIRBuilder::InsertPointTy getBodyIP() const { 3752 assert(isValid() && "Requires a valid canonical loop"); 3753 BasicBlock *Body = getBody(); 3754 return {Body, Body->begin()}; 3755 }; 3756 3757 /// Return the insertion point for user code after the loop. 
getAfterIP()3758 OpenMPIRBuilder::InsertPointTy getAfterIP() const { 3759 assert(isValid() && "Requires a valid canonical loop"); 3760 BasicBlock *After = getAfter(); 3761 return {After, After->begin()}; 3762 }; 3763 getFunction()3764 Function *getFunction() const { 3765 assert(isValid() && "Requires a valid canonical loop"); 3766 return Header->getParent(); 3767 } 3768 3769 /// Consistency self-check. 3770 LLVM_ABI void assertOK() const; 3771 3772 /// Invalidate this loop. That is, the underlying IR does not fulfill the 3773 /// requirements of an OpenMP canonical loop anymore. 3774 LLVM_ABI void invalidate(); 3775 }; 3776 3777 } // end namespace llvm 3778 3779 #endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H 3780