//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the OpenMPIRBuilder class and helpers used as a convenient
// way to create LLVM instructions for OpenMP directives.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H

#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Allocator.h"
#include "llvm/TargetParser/Triple.h"
#include <forward_list>
#include <map>
#include <optional>

namespace llvm {
class CanonicalLoopInfo;
struct TargetRegionEntryInfo;
class OffloadEntriesInfoManager;
class OpenMPIRBuilder;

/// Move the instructions after an InsertPoint to the beginning of another
/// BasicBlock.
///
/// The instructions after \p IP are moved to the beginning of \p New which must
/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
/// \p New will be added such that there is no semantic change. Otherwise, the
/// \p IP insert block remains degenerate and it is up to the caller to insert a
/// terminator.
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
              bool CreateBranch);

/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
/// insert location will stick to after the instruction before the insertion
/// point (instead of moving with the instruction the InsertPoint stores
/// internally).
void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);

/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
/// (missing the terminator).
///
/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
/// is true, a branch to the new successor will be created such that
/// semantically there is no change; otherwise the block of the insertion point
/// remains degenerate and it is the caller's responsibility to insert a
/// terminator. Returns the new successor block.
BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                    llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator). Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
                    llvm::Twine Name = {});

/// Split a BasicBlock at \p Builder's insertion point, even if the block is
/// degenerate (missing the terminator). Its new insert location will stick to
/// after the instruction before the insertion point (instead of moving with the
/// instruction the InsertPoint stores internally).
BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);

/// Like splitBB, but reuses the current block's name for the new name.
BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                              llvm::Twine Suffix = ".split");

/// Captures attributes that affect generating LLVM-IR using the
/// OpenMPIRBuilder and related classes. Note that not all attributes are
/// required for all classes or functions. In some use cases the configuration
/// is not necessary at all, because the only functions that are called are
/// ones that are not dependent on the configuration.
class OpenMPIRBuilderConfig {
public:
  /// Flag to define whether to generate code for the role of the OpenMP host
  /// (if set to false) or device (if set to true) in an offloading context. It
  /// is set when the -fopenmp-is-target-device compiler frontend option is
  /// specified.
  std::optional<bool> IsTargetDevice;

  /// Flag for specifying if the compilation is done for an accelerator. It is
  /// set according to the architecture of the target triple and currently only
  /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
  /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
  /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
  /// like target with the ability to work as the OpenMP host is added, or if
  /// the capabilities of the currently supported GPU architectures are
  /// expanded.
  std::optional<bool> IsGPU;

  /// Flag for specifying if LLVMUsed information should be emitted.
  std::optional<bool> EmitLLVMUsedMetaInfo;

  /// Flag for specifying if offloading is mandatory.
  std::optional<bool> OpenMPOffloadMandatory;

  /// First separator used between the initial two parts of a name.
  std::optional<StringRef> FirstSeparator;
  /// Separator used between all of the rest consecutive parts of a name.
  std::optional<StringRef> Separator;

  // Grid value for the GPU target.
  std::optional<omp::GV> GridValue;

  OpenMPIRBuilderConfig();
  OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                        bool OpenMPOffloadMandatory,
                        bool HasRequiresReverseOffload,
                        bool HasRequiresUnifiedAddress,
                        bool HasRequiresUnifiedSharedMemory,
                        bool HasRequiresDynamicAllocators);

  // Getter functions that assert if the required values are not present.
  bool isTargetDevice() const {
    assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
    return *IsTargetDevice;
  }

  bool isGPU() const {
    assert(IsGPU.has_value() && "IsGPU is not set");
    return *IsGPU;
  }

  bool openMPOffloadMandatory() const {
    assert(OpenMPOffloadMandatory.has_value() &&
           "OpenMPOffloadMandatory is not set");
    return *OpenMPOffloadMandatory;
  }

  omp::GV getGridValue() const {
    assert(GridValue.has_value() && "GridValue is not set");
    return *GridValue;
  }

  bool hasRequiresFlags() const { return RequiresFlags; }
  bool hasRequiresReverseOffload() const;
  bool hasRequiresUnifiedAddress() const;
  bool hasRequiresUnifiedSharedMemory() const;
  bool hasRequiresDynamicAllocators() const;

  /// Returns requires directive clauses as flags compatible with those expected
  /// by libomptarget.
  int64_t getRequiresFlags() const;

  // Returns the FirstSeparator if set, otherwise use the default separator
  // depending on isGPU.
  StringRef firstSeparator() const {
    if (FirstSeparator.has_value())
      return *FirstSeparator;
    if (isGPU())
      return "_";
    return ".";
  }

  // Returns the Separator if set, otherwise use the default separator depending
  // on isGPU.
  StringRef separator() const {
    if (Separator.has_value())
      return *Separator;
    if (isGPU())
      return "$";
    return ".";
  }

  void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
  void setIsGPU(bool Value) { IsGPU = Value; }
  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
  void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
  void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
  void setSeparator(StringRef S) { Separator = S; }
  void setGridValue(omp::GV G) { GridValue = G; }

  void setHasRequiresReverseOffload(bool Value);
  void setHasRequiresUnifiedAddress(bool Value);
  void setHasRequiresUnifiedSharedMemory(bool Value);
  void setHasRequiresDynamicAllocators(bool Value);

private:
  /// Flags for specifying which requires directive clauses are present.
  int64_t RequiresFlags;
};

/// Data structure to contain the information needed to uniquely identify
/// a target entry.
struct TargetRegionEntryInfo {
  std::string ParentName;
  unsigned DeviceID;
  unsigned FileID;
  unsigned Line;
  unsigned Count;

  TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
  TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
                        unsigned FileID, unsigned Line, unsigned Count = 0)
      : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
        Count(Count) {}

  static void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                                         StringRef ParentName,
                                         unsigned DeviceID, unsigned FileID,
                                         unsigned Line, unsigned Count);

  bool operator<(const TargetRegionEntryInfo &RHS) const {
    return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
           std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
                           RHS.Count);
  }
};

/// Class that manages information about offload code regions and data.
class OffloadEntriesInfoManager {
  OpenMPIRBuilder *OMPBuilder;
  /// Number of entries registered so far.
  unsigned OffloadingEntriesNum = 0;

public:
  /// Base class of the entries info.
  class OffloadEntryInfo {
  public:
    /// Kind of a given entry.
    enum OffloadingEntryInfoKinds : unsigned {
      /// Entry is a target region.
      OffloadingEntryInfoTargetRegion = 0,
      /// Entry is a declare target variable.
      OffloadingEntryInfoDeviceGlobalVar = 1,
      /// Invalid entry info.
      OffloadingEntryInfoInvalid = ~0u
    };

  protected:
    OffloadEntryInfo() = delete;
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
                              uint32_t Flags)
        : Flags(Flags), Order(Order), Kind(Kind) {}
    ~OffloadEntryInfo() = default;

  public:
    bool isValid() const { return Order != ~0u; }
    unsigned getOrder() const { return Order; }
    OffloadingEntryInfoKinds getKind() const { return Kind; }
    uint32_t getFlags() const { return Flags; }
    void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
    Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
    void setAddress(Constant *V) {
      assert(!Addr.pointsToAliveValue() && "Address has been set before!");
      Addr = V;
    }
    static bool classof(const OffloadEntryInfo *Info) { return true; }

  private:
    /// Address of the entity that has to be mapped for offloading.
    WeakTrackingVH Addr;

    /// Flags associated with the device global.
    uint32_t Flags = 0u;

    /// Order this entry was emitted.
    unsigned Order = ~0u;

    OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
  };

  /// Return true if there are no entries defined.
  bool empty() const;
  /// Return number of entries defined so far.
  unsigned size() const { return OffloadingEntriesNum; }

  OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}

  //
  // Target region entries related.
  //

  /// Kind of the target registry entry.
  enum OMPTargetRegionEntryKind : uint32_t {
    /// Mark the entry as target region.
    OMPTargetRegionEntryTargetRegion = 0x0,
  };

  /// Target region entries info.
  class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
    /// Address that can be used as the ID of the entry.
    Constant *ID = nullptr;

  public:
    OffloadEntryInfoTargetRegion()
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
    explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
                                          Constant *ID,
                                          OMPTargetRegionEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
          ID(ID) {
      setAddress(Addr);
    }

    Constant *getID() const { return ID; }
    void setID(Constant *V) {
      assert(!ID && "ID has been set before!");
      ID = V;
    }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoTargetRegion;
    }
  };

  /// Initialize target region entry.
  /// This is ONLY needed for DEVICE compilation.
  void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
                                       unsigned Order);
  /// Register target region entry.
  void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                     Constant *Addr, Constant *ID,
                                     OMPTargetRegionEntryKind Flags);
  /// Return true if a target region entry with the provided information
  /// exists.
  bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                bool IgnoreAddressId = false) const;

  // Return the Name based on \a EntryInfo using the next available Count.
  void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                                  const TargetRegionEntryInfo &EntryInfo);

  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
                            const OffloadEntryInfoTargetRegion &)>
      OffloadTargetRegionEntryInfoActTy;
  void
  actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);

  //
  // Device global variable entries related.
  //

  /// Kind of the global variable entry.
  enum OMPTargetGlobalVarEntryKind : uint32_t {
    /// Mark the entry as a to declare target.
    OMPTargetGlobalVarEntryTo = 0x0,
    /// Mark the entry as a to declare target link.
    OMPTargetGlobalVarEntryLink = 0x1,
    /// Mark the entry as a declare target enter.
    OMPTargetGlobalVarEntryEnter = 0x2,
    /// Mark the entry as having no declare target entry kind.
    OMPTargetGlobalVarEntryNone = 0x3,
    /// Mark the entry as a declare target indirect global.
    OMPTargetGlobalVarEntryIndirect = 0x8,
    /// Mark the entry as a register requires global.
    OMPTargetGlobalRegisterRequires = 0x10,
  };

  /// Kind of device clause for declare target variables
  /// and functions.
  /// NOTE: Currently not used as a part of a variable entry;
  /// used for Flang and Clang to interface with the variable
  /// related registration functions.
  enum OMPTargetDeviceClauseKind : uint32_t {
    /// The target is marked for all devices.
    OMPTargetDeviceClauseAny = 0x0,
    /// The target is marked for non-host devices.
    OMPTargetDeviceClauseNoHost = 0x1,
    /// The target is marked for host devices.
    OMPTargetDeviceClauseHost = 0x2,
    /// The target is marked as having no clause.
    OMPTargetDeviceClauseNone = 0x3
  };

  /// Device global variable entries info.
  class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
    /// Size of the global variable.
    int64_t VarSize;
    GlobalValue::LinkageTypes Linkage;
    const std::string VarName;

  public:
    OffloadEntryInfoDeviceGlobalVar()
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
                                             OMPTargetGlobalVarEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
                                             int64_t VarSize,
                                             OMPTargetGlobalVarEntryKind Flags,
                                             GlobalValue::LinkageTypes Linkage,
                                             const std::string &VarName)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
          VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
      setAddress(Addr);
    }

    int64_t getVarSize() const { return VarSize; }
    StringRef getVarName() const { return VarName; }
    void setVarSize(int64_t Size) { VarSize = Size; }
    GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
    void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
    }
  };

  /// Initialize device global variable entry.
  /// This is ONLY used for DEVICE compilation.
  void initializeDeviceGlobalVarEntryInfo(StringRef Name,
                                          OMPTargetGlobalVarEntryKind Flags,
                                          unsigned Order);

  /// Register device global variable entry.
  void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr,
                                        int64_t VarSize,
                                        OMPTargetGlobalVarEntryKind Flags,
                                        GlobalValue::LinkageTypes Linkage);
  /// Checks if the variable with the given name has been registered already.
  bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
    return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
  }
  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
      OffloadDeviceGlobalVarEntryInfoActTy;
  void actOnDeviceGlobalVarEntriesInfo(
      const OffloadDeviceGlobalVarEntryInfoActTy &Action);

private:
  /// Return the count of entries at a particular source location.
  unsigned
  getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;

  /// Update the count of entries at a particular source location.
  void
  incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);

  /// Build the key used for the per-location entry count map: the same
  /// source location with the Count field zeroed out.
  static TargetRegionEntryInfo
  getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
    return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
                                 EntryInfo.FileID, EntryInfo.Line, 0);
  }

  // Count of entries at a location.
  std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;

  // Storage for target region entries kind.
  typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
      OffloadEntriesTargetRegionTy;
  OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
  /// Storage for device global variable entries kind. The storage is to be
  /// indexed by mangled name.
  typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
      OffloadEntriesDeviceGlobalVarTy;
  OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
};

/// An interface to create LLVM-IR for OpenMP directives.
///
/// Each OpenMP directive has a corresponding public generator method.
class OpenMPIRBuilder {
public:
  /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
  /// not have an effect on \p M (see initialize).
  OpenMPIRBuilder(Module &M)
      : M(M), Builder(M.getContext()), OffloadInfoManager(this),
        T(Triple(M.getTargetTriple())) {}
  ~OpenMPIRBuilder();

  /// Initialize the internal state, this will put structures types and
  /// potentially other helpers into the underlying module. Must be called
  /// before any other method and only once! This internal state includes types
  /// used in the OpenMPIRBuilder generated from OMPKinds.def.
  void initialize();

  void setConfig(OpenMPIRBuilderConfig C) { Config = C; }

  /// Finalize the underlying module, e.g., by outlining regions.
  /// \param Fn The function to be finalized. If not used,
  /// all functions are finalized.
  void finalize(Function *Fn = nullptr);

  /// Add attributes known for \p FnID to \p Fn.
  void addAttributes(omp::RuntimeFunction FnID, Function &Fn);

  /// Type used throughout for insertion points.
  using InsertPointTy = IRBuilder<>::InsertPoint;

  /// Create a name using the platform-specific separators.
  /// \param Parts parts of the final name that needs separation
  /// The created name has a first separator between the first and second part
  /// and a second separator between all other parts.
  /// E.g. with FirstSeparator "$" and Separator "."
  /// and parts: "p1", "p2", "p3", "p4"
  /// The resulting name is "p1$p2.p3.p4".
  /// The separators are retrieved from the OpenMPIRBuilderConfig.
  std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;

  /// Callback type for variable finalization (think destructors).
  ///
  /// \param CodeGenIP is the insertion point at which the finalization code
  ///                  should be placed.
  ///
  /// A finalize callback knows about all objects that need finalization, e.g.
  /// destruction, when the scope of the currently generated construct is left
  /// at the time, and location, the callback is invoked.
  using FinalizeCallbackTy = std::function<void(InsertPointTy CodeGenIP)>;

  struct FinalizationInfo {
    /// The finalization callback provided by the last in-flight invocation of
    /// createXXXX for the directive of kind DK.
    FinalizeCallbackTy FiniCB;

    /// The directive kind of the innermost directive that has an associated
    /// region which might require finalization when it is left.
    omp::Directive DK;

    /// Flag to indicate if the directive is cancellable.
    bool IsCancellable;
  };

  /// Push a finalization callback on the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void pushFinalizationCB(const FinalizationInfo &FI) {
    FinalizationStack.push_back(FI);
  }

  /// Pop the last finalization callback from the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void popFinalizationCB() { FinalizationStack.pop_back(); }

  /// Callback type for body (=inner region) code generation.
  ///
  /// The callback takes code locations as arguments, each describing a
  /// location where additional instructions can be inserted.
  ///
  /// The CodeGenIP may be in the middle of a basic block or point to the end of
  /// it. The basic block may have a terminator or be degenerate. The callback
  /// function may just insert instructions at that position, but also split the
  /// block (without the Before argument of BasicBlock::splitBasicBlock such
  /// that the identity of the split predecessor block is preserved) and insert
  /// additional control flow, including branches that do not lead back to what
  /// follows the CodeGenIP. Note that since the callback is allowed to split
  /// the block, callers must assume that InsertPoints to positions in the
  /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
  /// such InsertPoints need to be preserved, it can split the block itself
  /// before calling the callback.
  ///
  /// AllocaIP and CodeGenIP must not point to the same position.
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed. The BasicBlock it is pointing to must
  ///                 not be split.
  /// \param CodeGenIP is the insertion point at which the body code should be
  ///                  placed.
  using BodyGenCallbackTy =
      function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  // This is created primarily for the sections construct as llvm::function_ref
  // (BodyGenCallbackTy) is not storable (as described in the comments of
  // the function_ref class - function_ref contains a non-ownable reference
  // to the callable).
  using StorableBodyGenCallbackTy =
      std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;

  /// Callback type for loop body code generation.
  ///
  /// \param CodeGenIP is the insertion point where the loop's body code must be
  ///                  placed. This will be a dedicated BasicBlock with a
  ///                  conditional branch from the loop condition check and
  ///                  terminated with an unconditional branch to the loop
  ///                  latch.
  /// \param IndVar is the induction variable usable at the insertion point.
  using LoopBodyGenCallbackTy =
      function_ref<void(InsertPointTy CodeGenIP, Value *IndVar)>;

  /// Callback type for variable privatization (think copy & default
  /// constructor).
  ///
  /// \param AllocaIP is the insertion point at which new alloca instructions
  ///                 should be placed.
  /// \param CodeGenIP is the insertion point at which the privatization code
  ///                  should be placed.
  /// \param Original The value being copied/created, should not be used in the
  ///                 generated IR.
  /// \param Inner The equivalent of \p Original that should be used in the
  ///              generated IR; this is equal to \p Original if the value is
  ///              a pointer and can thus be passed directly, otherwise it is
  ///              an equivalent but different value.
  /// \param ReplVal The replacement value, thus a copy or new created version
  ///                of \p Inner.
  ///
  /// \returns The new insertion point where code generation continues and
  ///          \p ReplVal the replacement value.
  using PrivatizeCallbackTy = function_ref<InsertPointTy(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
      Value &Inner, Value *&ReplVal)>;

  /// Description of a LLVM-IR insertion point (IP) and a debug/source location
  /// (filename, line, column, ...).
  struct LocationDescription {
    LocationDescription(const IRBuilderBase &IRB)
        : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
    LocationDescription(const InsertPointTy &IP) : IP(IP) {}
    LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
        : IP(IP), DL(DL) {}
    InsertPointTy IP;
    DebugLoc DL;
  };

  /// Emitter methods for OpenMP directives.
  ///
  ///{

  /// Generator for '#omp barrier'
  ///
  /// \param Loc The location where the barrier directive was encountered.
  /// \param Kind The kind of directive that caused the barrier.
  /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
  /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
  ///                        should be checked and acted upon.
  ///
  /// \returns The insertion point after the barrier.
  InsertPointTy createBarrier(const LocationDescription &Loc,
                              omp::Directive Kind, bool ForceSimpleCall = false,
                              bool CheckCancelFlag = true);

  /// Generator for '#omp cancel'
  ///
  /// \param Loc The location where the directive was encountered.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param CanceledDirective The kind of directive that is canceled.
  ///
  /// \returns The insertion point after the cancel.
  InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
                             omp::Directive CanceledDirective);

  /// Generator for '#omp parallel'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion points to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param PrivCB Callback to copy a given variable (think copy constructor).
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
  /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
  /// \param IsCancellable Flag to indicate a cancellable parallel region.
  ///
  /// \returns The insertion position *after* the parallel.
  IRBuilder<>::InsertPoint
  createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP,
                 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
                 FinalizeCallbackTy FiniCB, Value *IfCondition,
                 Value *NumThreads, omp::ProcBindKind ProcBind,
                 bool IsCancellable);

  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// This generator operates on the logical iteration space of the loop, i.e.
  /// the caller only has to provide a loop trip count of the loop as defined by
  /// base language semantics. The trip count is interpreted as an unsigned
  /// integer. The induction variable passed to \p BodyGenCB will be of the same
  /// type and run from 0 to \p TripCount - 1. It is up to the callback to
  /// convert the logical iteration variable to the loop counter variable in the
  /// loop body.
  ///
  /// \param Loc The insert and source location description. The insert
  ///            location can be between two instructions or the end of a
  ///            degenerate block (e.g. a BB under construction).
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param TripCount Number of iterations the loop body is executed.
  /// \param Name Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
                                         LoopBodyGenCallbackTy BodyGenCB,
                                         Value *TripCount,
                                         const Twine &Name = "loop");

  /// Generator for the control flow structure of an OpenMP canonical loop.
  ///
  /// Instead of a logical iteration space, this allows specifying user-defined
  /// loop counter values using increment, upper- and lower bounds.
  /// To disambiguate the terminology when counting downwards, instead of lower
  /// bounds we use \p Start for the loop counter value in the first body
  /// iteration.
  ///
  /// Consider the following limitations:
  ///
  /// * A loop counter space over all integer values of its bit-width cannot be
  ///   represented. E.g. using uint8_t, its loop trip count of 256 cannot be
  ///   stored into an 8 bit integer:
  ///
  ///       DO I = 0, 255, 1
  ///
  /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
  ///   effectively counting downwards:
  ///
  ///       for (uint8_t i = 100u; i > 0; i += 127u)
  ///
  ///
  /// TODO: May need to add additional parameters to represent:
  ///
  /// * Allow representing downcounting with unsigned integers.
  ///
  /// * Sign of the step and the comparison operator might disagree:
  ///
  ///       for (int i = 0; i < 42; i -= 1u)
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param Start Value of the loop counter for the first iteration.
  /// \param Stop Loop counter values past this will stop the loop.
  /// \param Step Loop counter increment after each iteration; negative
  ///             means counting down.
  /// \param IsSigned Whether Start, Stop and Step are signed integers.
  /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
  ///                      counter.
  /// \param ComputeIP Insertion point for instructions computing the trip
  ///                  count. Can be used to ensure the trip count is available
  ///                  at the outermost loop of a loop nest. If not set,
  ///                  defaults to the preheader of the generated loop.
  /// \param Name Base name used to derive BB and instruction names.
  ///
  /// \returns An object representing the created control flow structure which
  ///          can be used for loop-associated directives.
  CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
                                         LoopBodyGenCallbackTy BodyGenCB,
                                         Value *Start, Value *Stop, Value *Step,
                                         bool IsSigned, bool InclusiveStop,
                                         InsertPointTy ComputeIP = {},
                                         const Twine &Name = "loop");

  /// Collapse a loop nest into a single loop.
  ///
  /// Merges loops of a loop nest into a single CanonicalLoopNest representation
  /// that has the same number of innermost loop iterations as the origin loop
  /// nest. The induction variables of the input loops are derived from the
  /// collapsed loop's induction variable. This is intended to be used to
  /// implement OpenMP's collapse clause. Before applying a directive,
  /// collapseLoops normalizes a loop nest to contain only a single loop and the
  /// directive's implementation does not need to handle multiple loops itself.
  /// This does not remove the need to handle all loop nest handling by
  /// directives, such as the ordered(<n>) clause or the simd schedule-clause
  /// modifier of the worksharing-loop directive.
  ///
  /// Example:
  /// \code
  ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
  ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
  ///       body(i, j);
  /// \endcode
  ///
  /// After collapsing with Loops={i,j}, the loop is changed to
  /// \code
  ///   for (int ij = 0; ij < 63; ++ij) {
  ///     int i = ij / 9;
  ///     int j = ij % 9;
  ///     body(i, j);
  ///   }
  /// \endcode
  ///
  /// In the current implementation, the following limitations apply:
  ///
  /// * All input loops have an induction variable of the same type.
  ///
  /// * The collapsed loop will have the same trip count integer type as the
  ///   input loops. Therefore it is possible that the collapsed loop cannot
  ///   represent all iterations of the input loops.
For instance, assuming a 781 /// 32 bit integer type, and two input loops both iterating 2^16 times, the 782 /// theoretical trip count of the collapsed loop would be 2^32 iterations, 783 /// which cannot be represented in a 32-bit integer. Behavior is undefined 784 /// in this case. 785 /// 786 /// * The trip counts of every input loop must be available at \p ComputeIP. 787 ///   Non-rectangular loops are not yet supported. 788 /// 789 /// * At each nest level, code between a surrounding loop and its nested loop 790 ///   is hoisted into the loop body, and such code will be executed more 791 ///   often than before collapsing (or not at all if any inner loop iteration 792 ///   has a trip count of 0). This is permitted by the OpenMP specification. 793 /// 794 /// \param DL        Debug location for instructions added for collapsing, 795 ///                  such as instructions to compute/derive the input loop's 796 ///                  induction variables. 797 /// \param Loops     Loops in the loop nest to collapse. Loops are specified 798 ///                  from outermost-to-innermost and every control flow of a 799 ///                  loop's body must pass through its directly nested loop. 800 /// \param ComputeIP Where additional instructions that compute the collapsed 801 ///                  trip count are inserted. If not set, defaults to before 802 ///                  the generated loop. 803 /// 804 /// \returns The CanonicalLoopInfo object representing the collapsed loop. 805 CanonicalLoopInfo *collapseLoops(DebugLoc DL, 806 ArrayRef<CanonicalLoopInfo *> Loops, 807 InsertPointTy ComputeIP); 808 809 /// Get the default alignment value for given target 810 /// 811 /// \param TargetTriple   Target triple 812 /// \param Features       StringMap which describes extra CPU features 813 static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, 814 const StringMap<bool> &Features); 815 816 /// Retrieve (or create if non-existent) the address of a declare 817 /// target variable, used in conjunction with registerTargetGlobalVariable 818 /// to create declare target global variables. 
819 /// 820 /// \param CaptureClause - enumerator corresponding to the OpenMP capture 821 /// clause used in conjunction with the variable being registered (link, 822 /// to, enter). 823 /// \param DeviceClause - enumerator corresponding to the OpenMP capture 824 /// clause used in conjunction with the variable being registered (nohost, 825 /// host, any) 826 /// \param IsDeclaration - boolean stating if the variable being registered 827 /// is a declaration-only and not a definition 828 /// \param IsExternallyVisible - boolean stating if the variable is externally 829 /// visible 830 /// \param EntryInfo - Unique entry information for the value generated 831 /// using getTargetEntryUniqueInfo, used to name generated pointer references 832 /// to the declare target variable 833 /// \param MangledName - the mangled name of the variable being registered 834 /// \param GeneratedRefs - references generated by invocations of 835 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar, 836 /// these are required by Clang for book keeping. 837 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled 838 /// \param TargetTriple - The OpenMP device target triple we are compiling 839 /// for 840 /// \param LlvmPtrTy - The type of the variable we are generating or 841 /// retrieving an address for 842 /// \param GlobalInitializer - a lambda function which creates a constant 843 /// used for initializing a pointer reference to the variable in certain 844 /// cases. If a nullptr is passed, it will default to utilising the original 845 /// variable to initialize the pointer reference. 846 /// \param VariableLinkage - a lambda function which returns the variables 847 /// linkage type, if unspecified and a nullptr is given, it will instead 848 /// utilise the linkage stored on the existing global variable in the 849 /// LLVMModule. 
850 Constant *getAddrOfDeclareTargetVar( 851 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 852 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 853 bool IsDeclaration, bool IsExternallyVisible, 854 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 855 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 856 std::vector<Triple> TargetTriple, Type *LlvmPtrTy, 857 std::function<Constant *()> GlobalInitializer, 858 std::function<GlobalValue::LinkageTypes()> VariableLinkage); 859 860 /// Registers a target variable for device or host. 861 /// 862 /// \param CaptureClause - enumerator corresponding to the OpenMP capture 863 /// clause used in conjunction with the variable being registered (link, 864 /// to, enter). 865 /// \param DeviceClause - enumerator corresponding to the OpenMP capture 866 /// clause used in conjunction with the variable being registered (nohost, 867 /// host, any) 868 /// \param IsDeclaration - boolean stating if the variable being registered 869 /// is a declaration-only and not a definition 870 /// \param IsExternallyVisible - boolean stating if the variable is externally 871 /// visible 872 /// \param EntryInfo - Unique entry information for the value generated 873 /// using getTargetEntryUniqueInfo, used to name generated pointer references 874 /// to the declare target variable 875 /// \param MangledName - the mangled name of the variable being registered 876 /// \param GeneratedRefs - references generated by invocations of 877 /// registerTargetGlobalVariable these are required by Clang for book 878 /// keeping. 879 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled 880 /// \param TargetTriple - The OpenMP device target triple we are compiling 881 /// for 882 /// \param GlobalInitializer - a lambda function which creates a constant 883 /// used for initializing a pointer reference to the variable in certain 884 /// cases. 
If a nullptr is passed, it will default to utilising the original 885 /// variable to initialize the pointer reference. 886 /// \param VariableLinkage - a lambda function which returns the variables 887 /// linkage type, if unspecified and a nullptr is given, it will instead 888 /// utilise the linkage stored on the existing global variable in the 889 /// LLVMModule. 890 /// \param LlvmPtrTy - The type of the variable we are generating or 891 /// retrieving an address for 892 /// \param Addr - the original llvm value (addr) of the variable to be 893 /// registered 894 void registerTargetGlobalVariable( 895 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, 896 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, 897 bool IsDeclaration, bool IsExternallyVisible, 898 TargetRegionEntryInfo EntryInfo, StringRef MangledName, 899 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD, 900 std::vector<Triple> TargetTriple, 901 std::function<Constant *()> GlobalInitializer, 902 std::function<GlobalValue::LinkageTypes()> VariableLinkage, 903 Type *LlvmPtrTy, Constant *Addr); 904 905 /// Get the offset of the OMP_MAP_MEMBER_OF field. 906 unsigned getFlagMemberOffset(); 907 908 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on 909 /// the position given. 910 /// \param Position - A value indicating the position of the parent 911 /// of the member in the kernel argument structure, often retrieved 912 /// by the parents position in the combined information vectors used 913 /// to generate the structure itself. Multiple children (member's of) 914 /// with the same parent will use the same returned member flag. 915 omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position); 916 917 /// Given an initial flag set, this function modifies it to contain 918 /// the passed in MemberOfFlag generated from the getMemberOfFlag 919 /// function. 
The results are dependent on the existing flag bits 920 /// set in the original flag set. 921 /// \param Flags - The original set of flags to be modified with the 922 /// passed in MemberOfFlag. 923 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted 924 /// slightly based on the getMemberOfFlag which adjusts the flag bits 925 /// based on the members position in its parent. 926 void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, 927 omp::OpenMPOffloadMappingFlags MemberOfFlag); 928 929 private: 930 /// Modifies the canonical loop to be a statically-scheduled workshare loop 931 /// which is executed on the device 932 /// 933 /// This takes a \p CLI representing a canonical loop, such as the one 934 /// created by \see createCanonicalLoop and emits additional instructions to 935 /// turn it into a workshare loop. In particular, it calls to an OpenMP 936 /// runtime function in the preheader to call OpenMP device rtl function 937 /// which handles worksharing of loop body iterations. 938 /// 939 /// \param DL       Debug location for instructions added for the 940 ///                 workshare-loop construct itself. 941 /// \param CLI      A descriptor of the canonical loop to workshare. 942 /// \param AllocaIP An insertion point for Alloca instructions usable in the 943 ///                 preheader of the loop. 944 /// \param LoopType Information about type of loop worksharing. 945 ///                 It corresponds to type of loop workshare OpenMP pragma. 946 /// 947 /// \returns Point where to insert code after the workshare construct. 948 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, 949 InsertPointTy AllocaIP, 950 omp::WorksharingLoopType LoopType); 951 952 /// Modifies the canonical loop to be a statically-scheduled workshare loop. 953 /// 954 /// This takes a \p LoopInfo representing a canonical loop, such as the one 955 /// created by \p createCanonicalLoop and emits additional instructions to 956 /// turn it into a workshare loop. 
In particular, it calls to an OpenMP 957 /// runtime function in the preheader to obtain the loop bounds to be used in 958 /// the current thread, updates the relevant instructions in the canonical 959 /// loop and calls to an OpenMP runtime finalization function after the loop. 960 /// 961 /// \param DL       Debug location for instructions added for the 962 ///                 workshare-loop construct itself. 963 /// \param CLI      A descriptor of the canonical loop to workshare. 964 /// \param AllocaIP An insertion point for Alloca instructions usable in the 965 ///                 preheader of the loop. 966 /// \param NeedsBarrier Indicates whether a barrier must be inserted after 967 ///                     the loop. 968 /// 969 /// \returns Point where to insert code after the workshare construct. 970 InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, 971 InsertPointTy AllocaIP, 972 bool NeedsBarrier); 973 974 /// Modifies the canonical loop to be a statically-scheduled workshare loop 975 /// with a user-specified chunk size. 976 /// 977 /// \param DL           Debug location for instructions added for the 978 ///                     workshare-loop construct itself. 979 /// \param CLI          A descriptor of the canonical loop to workshare. 980 /// \param AllocaIP     An insertion point for Alloca instructions usable in 981 ///                     the preheader of the loop. 982 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the 983 ///                     loop. 984 /// \param ChunkSize    The user-specified chunk size. 985 /// 986 /// \returns Point where to insert code after the workshare construct. 987 InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL, 988 CanonicalLoopInfo *CLI, 989 InsertPointTy AllocaIP, 990 bool NeedsBarrier, 991 Value *ChunkSize); 992 993 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop. 994 /// 995 /// This takes a \p LoopInfo representing a canonical loop, such as the one 996 /// created by \p createCanonicalLoop and emits additional instructions to 997 /// turn it into a workshare loop. 
In particular, it calls to an OpenMP 998 /// runtime function in the preheader to obtain, and then in each iteration 999 /// to update the loop counter. 1000 /// 1001 /// \param DL       Debug location for instructions added for the 1002 ///                 workshare-loop construct itself. 1003 /// \param CLI      A descriptor of the canonical loop to workshare. 1004 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1005 ///                 preheader of the loop. 1006 /// \param SchedType Type of scheduling to be passed to the init function. 1007 /// \param NeedsBarrier Indicates whether a barrier must be inserted after 1008 ///                     the loop. 1009 /// \param Chunk    The size of loop chunk considered as a unit when 1010 ///                 scheduling. If \p nullptr, defaults to 1. 1011 /// 1012 /// \returns Point where to insert code after the workshare construct. 1013 InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, 1014 InsertPointTy AllocaIP, 1015 omp::OMPScheduleType SchedType, 1016 bool NeedsBarrier, 1017 Value *Chunk = nullptr); 1018 1019 /// Create alternative version of the loop to support if clause 1020 /// 1021 /// OpenMP if clause can require to generate a second loop. This loop 1022 /// will be executed when if clause condition is not met. createIfVersion 1023 /// adds branch instruction to the copied loop if \p ifCond is not met. 1024 /// 1025 /// \param Loop       Original loop which should be versioned. 1026 /// \param IfCond     Value which corresponds to if clause condition 1027 /// \param VMap       Value to value map to define relation between 1028 ///                   original and copied loop values and loop blocks. 1029 /// \param NamePrefix Optional name prefix for if.then if.else blocks. 1030 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond, 1031 ValueToValueMapTy &VMap, const Twine &NamePrefix = ""); 1032 1033 public: 1034 /// Modifies the canonical loop to be a workshare loop. 
1035 /// 1036 /// This takes a \p LoopInfo representing a canonical loop, such as the one 1037 /// created by \p createCanonicalLoop and emits additional instructions to 1038 /// turn it into a workshare loop. In particular, it calls to an OpenMP 1039 /// runtime function in the preheader to obtain the loop bounds to be used in 1040 /// the current thread, updates the relevant instructions in the canonical 1041 /// loop and calls to an OpenMP runtime finalization function after the loop. 1042 /// 1043 /// The concrete transformation is done by applyStaticWorkshareLoop, 1044 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending 1045 /// on the value of \p SchedKind and \p ChunkSize. 1046 /// 1047 /// \param DL       Debug location for instructions added for the 1048 ///                 workshare-loop construct itself. 1049 /// \param CLI      A descriptor of the canonical loop to workshare. 1050 /// \param AllocaIP An insertion point for Alloca instructions usable in the 1051 ///                 preheader of the loop. 1052 /// \param NeedsBarrier Indicates whether a barrier must be inserted after 1053 ///                     the loop. 1054 /// \param SchedKind Scheduling algorithm to use. 1055 /// \param ChunkSize The chunk size for the inner loop. 1056 /// \param HasSimdModifier Whether the simd modifier is present in the 1057 ///                        schedule clause. 1058 /// \param HasMonotonicModifier Whether the monotonic modifier is present in 1059 ///                             the schedule clause. 1060 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is 1061 ///                                present in the schedule clause. 1062 /// \param HasOrderedClause Whether the (parameterless) ordered clause is 1063 ///                         present. 1064 /// \param LoopType Information about type of loop worksharing. 1065 ///                 It corresponds to type of loop workshare OpenMP pragma. 1066 /// 1067 /// \returns Point where to insert code after the workshare construct. 
1068 InsertPointTy applyWorkshareLoop( 1069 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 1070 bool NeedsBarrier, 1071 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default, 1072 Value *ChunkSize = nullptr, bool HasSimdModifier = false, 1073 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false, 1074 bool HasOrderedClause = false, 1075 omp::WorksharingLoopType LoopType = 1076 omp::WorksharingLoopType::ForStaticLoop); 1077 1078 /// Tile a loop nest. 1079 /// 1080 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in 1081 /// \p Loops must be perfectly nested, from outermost to innermost loop 1082 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value 1083 /// of every loop and every tile size must be usable in the outermost 1084 /// loop's preheader. This implies that the loop nest is rectangular. 1085 /// 1086 /// Example: 1087 /// \code 1088 ///   for (int i = 0; i < 15; ++i) // Canonical loop "i" 1089 ///     for (int j = 0; j < 14; ++j) // Canonical loop "j" 1090 ///       body(i, j); 1091 /// \endcode 1092 /// 1093 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to 1094 /// \code 1095 ///   for (int i1 = 0; i1 < 3; ++i1) 1096 ///     for (int j1 = 0; j1 < 2; ++j1) 1097 ///       for (int i2 = 0; i2 < 5; ++i2) 1098 ///         for (int j2 = 0; j2 < 7; ++j2) 1099 ///           body(i1*5+i2, j1*7+j2); 1100 /// \endcode 1101 /// 1102 /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are 1103 /// referred to as the floor, and the loops i2 and j2 are the tiles. Tiling 1104 /// also handles non-constant trip counts, non-constant tile sizes and trip 1105 /// counts that are not multiples of the tile size. In the latter case the 1106 /// tile loop of the last floor-loop iteration will have fewer iterations than 1107 /// specified as its tile size. 
1108 /// 1109 /// 1110 /// @param DL        Debug location for instructions added by tiling, for 1111 ///                  instance the floor- and tile trip count computation. 1112 /// @param Loops     Loops to tile. The CanonicalLoopInfo objects are 1113 ///                  invalidated by this method, i.e. should not be used after 1114 ///                  tiling. 1115 /// @param TileSizes For each loop in \p Loops, the tile size for that 1116 ///                  dimension. 1117 /// 1118 /// \returns A list of generated loops. Contains twice as many loops as the 1119 ///          input loop nest; the first half are the floor loops and the 1120 ///          second half are the tile loops. 1121 std::vector<CanonicalLoopInfo *> 1122 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, 1123 ArrayRef<Value *> TileSizes); 1124 1125 /// Fully unroll a loop. 1126 /// 1127 /// Instead of unrolling the loop immediately (and duplicating its body 1128 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop 1129 /// metadata. 1130 /// 1131 /// \param DL   Debug location for instructions added by unrolling. 1132 /// \param Loop The loop to unroll. The loop will be invalidated. 1133 void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop); 1134 1135 /// Fully or partially unroll a loop. How the loop is unrolled is determined 1136 /// using LLVM's LoopUnrollPass. 1137 /// 1138 /// \param DL   Debug location for instructions added by unrolling. 1139 /// \param Loop The loop to unroll. The loop will be invalidated. 1140 void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop); 1141 1142 /// Partially unroll a loop. 1143 /// 1144 /// The CanonicalLoopInfo of the unrolled loop for use with chained 1145 /// loop-associated directive can be requested using \p UnrolledCLI. Not 1146 /// needing the CanonicalLoopInfo allows more efficient code generation by 1147 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata. 
1148 /// A loop-associated directive applied to the unrolled loop needs to know the 1149 /// new trip count which means that if using a heuristically determined unroll 1150 /// factor (\p Factor == 0), that factor must be computed immediately. We are 1151 /// using the same logic as the LoopUnrollPass to derive the unroll factor, 1152 /// but which assumes that some canonicalization has taken place (e.g. 1153 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform 1154 /// better when the unrolled loop's CanonicalLoopInfo is not needed. 1155 /// 1156 /// \param DL          Debug location for instructions added by unrolling. 1157 /// \param Loop        The loop to unroll. The loop will be invalidated. 1158 /// \param Factor      The factor to unroll the loop by. A factor of 0 1159 ///                    indicates that a heuristic should be used to determine 1160 ///                    the unroll-factor. 1161 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the 1162 ///                    partially unrolled loop. Otherwise, uses loop metadata 1163 ///                    to defer unrolling to the LoopUnrollPass. 1164 void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, 1165 CanonicalLoopInfo **UnrolledCLI); 1166 1167 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop 1168 /// is cloned. The metadata which prevents vectorization is added to 1169 /// the cloned loop. The cloned loop is executed when ifCond is evaluated 1170 /// to false. 1171 /// 1172 /// \param Loop        The loop to simd-ize. 1173 /// \param AlignedVars The map which contains pairs of the pointer 1174 ///                    and its corresponding alignment. 1175 /// \param IfCond      The value which corresponds to the if clause 1176 ///                    condition. 1177 /// \param Order       The enum to map order clause. 1178 /// \param Simdlen     The Simdlen length to apply to the simd loop. 1179 /// \param Safelen     The Safelen length to apply to the simd loop. 
1180 void applySimd(CanonicalLoopInfo *Loop, 1181 MapVector<Value *, Value *> AlignedVars, Value *IfCond, 1182 omp::OrderKind Order, ConstantInt *Simdlen, 1183 ConstantInt *Safelen); 1184 1185 /// Generator for '#omp flush' 1186 /// 1187 /// \param Loc The location where the flush directive was encountered 1188 void createFlush(const LocationDescription &Loc); 1189 1190 /// Generator for '#omp taskwait' 1191 /// 1192 /// \param Loc The location where the taskwait directive was encountered. 1193 void createTaskwait(const LocationDescription &Loc); 1194 1195 /// Generator for '#omp taskyield' 1196 /// 1197 /// \param Loc The location where the taskyield directive was encountered. 1198 void createTaskyield(const LocationDescription &Loc); 1199 1200 /// A struct to pack the relevant information for an OpenMP depend clause. 1201 struct DependData { 1202 omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown; 1203 Type *DepValueType; 1204 Value *DepVal; 1205 explicit DependData() = default; 1206 DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType, 1207 Value *DepVal) 1208 : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {} 1209 }; 1210 1211 /// Generator for `#omp task` 1212 /// 1213 /// \param Loc The location where the task construct was encountered. 1214 /// \param AllocaIP The insertion point to be used for alloca instructions. 1215 /// \param BodyGenCB Callback that will generate the region code. 1216 /// \param Tied True if the task is tied, false if the task is untied. 1217 /// \param Final i1 value which is `true` if the task is final, `false` if the 1218 /// task is not final. 1219 /// \param IfCondition i1 value. 
If it evaluates to `false`, an undeferred 1220 /// task is generated, and the encountering thread must 1221 /// suspend the current task region, for which execution 1222 /// cannot be resumed until execution of the structured 1223 /// block that is associated with the generated task is 1224 /// completed. 1225 InsertPointTy createTask(const LocationDescription &Loc, 1226 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, 1227 bool Tied = true, Value *Final = nullptr, 1228 Value *IfCondition = nullptr, 1229 SmallVector<DependData> Dependencies = {}); 1230 1231 /// Generator for the taskgroup construct 1232 /// 1233 /// \param Loc The location where the taskgroup construct was encountered. 1234 /// \param AllocaIP The insertion point to be used for alloca instructions. 1235 /// \param BodyGenCB Callback that will generate the region code. 1236 InsertPointTy createTaskgroup(const LocationDescription &Loc, 1237 InsertPointTy AllocaIP, 1238 BodyGenCallbackTy BodyGenCB); 1239 1240 using FileIdentifierInfoCallbackTy = 1241 std::function<std::tuple<std::string, uint64_t>()>; 1242 1243 /// Creates a unique info for a target entry when provided a filename and 1244 /// line number. 1245 /// 1246 /// \param CallBack A callback function which should return the filename the 1247 /// entry resides in as well as the line number for the target entry 1248 /// \param ParentName The name of the parent the target entry resides in, if 1249 /// any. 1250 static TargetRegionEntryInfo 1251 getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, 1252 StringRef ParentName = ""); 1253 1254 /// Enum class for the ReductionGen CallBack type to be used. 1255 enum class ReductionGenCBKind { Clang, MLIR }; 1256 1257 /// ReductionGen CallBack for Clang 1258 /// 1259 /// \param CodeGenIP InsertPoint for CodeGen. 1260 /// \param Index Index of the ReductionInfo to generate code for. 
1261 /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for 1262 /// codegen, used for fixup later. 1263 /// \param RHSPtr Optionally used by Clang to 1264 /// return the RHSPtr it used for codegen, used for fixup later. 1265 /// \param CurFn Optionally used by Clang to pass in the Current Function as 1266 /// Clang context may be old. 1267 using ReductionGenClangCBTy = 1268 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index, 1269 Value **LHS, Value **RHS, Function *CurFn)>; 1270 1271 /// ReductionGen CallBack for MLIR 1272 /// 1273 /// \param CodeGenIP InsertPoint for CodeGen. 1274 /// \param LHS Pass in the LHS Value to be used for CodeGen. 1275 /// \param RHS Pass in the RHS Value to be used for CodeGen. 1276 using ReductionGenCBTy = std::function<InsertPointTy( 1277 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>; 1278 1279 /// Functions used to generate atomic reductions. Such functions take two 1280 /// Values representing pointers to LHS and RHS of the reduction, as well as 1281 /// the element type of these pointers. They are expected to atomically 1282 /// update the LHS to the reduced value. 1283 using ReductionGenAtomicCBTy = 1284 std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>; 1285 1286 /// Enum class for reduction evaluation types scalar, complex and aggregate. 1287 enum class EvalKind { Scalar, Complex, Aggregate }; 1288 1289 /// Information about an OpenMP reduction. 
1290 struct ReductionInfo { 1291 ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, 1292 EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, 1293 ReductionGenClangCBTy ReductionGenClang, 1294 ReductionGenAtomicCBTy AtomicReductionGen) 1295 : ElementType(ElementType), Variable(Variable), 1296 PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind), 1297 ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang), 1298 AtomicReductionGen(AtomicReductionGen) {} 1299 ReductionInfo(Value *PrivateVariable) 1300 : ElementType(nullptr), Variable(nullptr), 1301 PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar), 1302 ReductionGen(), ReductionGenClang(), AtomicReductionGen() {} 1303 1304 /// Reduction element type, must match pointee type of variable. 1305 Type *ElementType; 1306 1307 /// Reduction variable of pointer type. 1308 Value *Variable; 1309 1310 /// Thread-private partial reduction variable. 1311 Value *PrivateVariable; 1312 1313 /// Reduction evaluation kind - scalar, complex or aggregate. 1314 EvalKind EvaluationKind; 1315 1316 /// Callback for generating the reduction body. The IR produced by this will 1317 /// be used to combine two values in a thread-safe context, e.g., under 1318 /// lock or within the same thread, and therefore need not be atomic. 1319 ReductionGenCBTy ReductionGen; 1320 1321 /// Clang callback for generating the reduction body. The IR produced by 1322 /// this will be used to combine two values in a thread-safe context, e.g., 1323 /// under lock or within the same thread, and therefore need not be atomic. 1324 ReductionGenClangCBTy ReductionGenClang; 1325 1326 /// Callback for generating the atomic reduction body, may be null. The IR 1327 /// produced by this will be used to atomically combine two values during 1328 /// reduction. 
If null, the implementation will use the non-atomic version 1329 /// along with the appropriate synchronization mechanisms. 1330 ReductionGenAtomicCBTy AtomicReductionGen; 1331 }; 1332 1333 enum class CopyAction : unsigned { 1334 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in 1335 // the warp using shuffle instructions. 1336 RemoteLaneToThread, 1337 // ThreadCopy: Make a copy of a Reduce list on the thread's stack. 1338 ThreadCopy, 1339 }; 1340 1341 struct CopyOptionsTy { 1342 Value *RemoteLaneOffset = nullptr; 1343 Value *ScratchpadIndex = nullptr; 1344 Value *ScratchpadWidth = nullptr; 1345 }; 1346 1347 /// Supporting functions for Reductions CodeGen. 1348 private: 1349 /// Emit the llvm.used metadata. 1350 void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List); 1351 1352 /// Get the id of the current thread on the GPU. 1353 Value *getGPUThreadID(); 1354 1355 /// Get the GPU warp size. 1356 Value *getGPUWarpSize(); 1357 1358 /// Get the id of the warp in the block. 1359 /// We assume that the warp size is 32, which is always the case 1360 /// on the NVPTX device, to generate more efficient code. 1361 Value *getNVPTXWarpID(); 1362 1363 /// Get the id of the current lane in the Warp. 1364 /// We assume that the warp size is 32, which is always the case 1365 /// on the NVPTX device, to generate more efficient code. 1366 Value *getNVPTXLaneID(); 1367 1368 /// Cast value to the specified type. 1369 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType); 1370 1371 /// This function creates calls to one of two shuffle functions to copy 1372 /// variables between lanes in a warp. 1373 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element, 1374 Type *ElementType, Value *Offset); 1375 1376 /// Function to shuffle over the value from the remote lane. 
1377 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr, 1378 Type *ElementType, Value *Offset, 1379 Type *ReductionArrayTy); 1380 1381 /// Emit instructions to copy a Reduce list, which contains partially 1382 /// aggregated values, in the specified direction. 1383 void emitReductionListCopy( 1384 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy, 1385 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase, 1386 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}); 1387 1388 /// Emit a helper that reduces data across two OpenMP threads (lanes) 1389 /// in the same warp. It uses shuffle instructions to copy over data from 1390 /// a remote lane's stack. The reduction algorithm performed is specified 1391 /// by the fourth parameter. 1392 /// 1393 /// Algorithm Versions. 1394 /// Full Warp Reduce (argument value 0): 1395 /// This algorithm assumes that all 32 lanes are active and gathers 1396 /// data from these 32 lanes, producing a single resultant value. 1397 /// Contiguous Partial Warp Reduce (argument value 1): 1398 /// This algorithm assumes that only a *contiguous* subset of lanes 1399 /// are active. This happens for the last warp in a parallel region 1400 /// when the user specified num_threads is not an integer multiple of 1401 /// 32. This contiguous subset always starts with the zeroth lane. 1402 /// Partial Warp Reduce (argument value 2): 1403 /// This algorithm gathers data from any number of lanes at any position. 1404 /// All reduced values are stored in the lowest possible lane. The set 1405 /// of problems every algorithm addresses is a super set of those 1406 /// addressable by algorithms with a lower version number. Overhead 1407 /// increases as algorithm version increases. 1408 /// 1409 /// Terminology 1410 /// Reduce element: 1411 /// Reduce element refers to the individual data field with primitive 1412 /// data types to be combined and reduced across threads. 
1413 /// Reduce list: 1414 /// Reduce list refers to a collection of local, thread-private 1415 /// reduce elements. 1416 /// Remote Reduce list: 1417 /// Remote Reduce list refers to a collection of remote (relative to 1418 /// the current thread) reduce elements. 1419 /// 1420 /// We distinguish between three states of threads that are important to 1421 /// the implementation of this function. 1422 /// Alive threads: 1423 /// Threads in a warp executing the SIMT instruction, as distinguished from 1424 /// threads that are inactive due to divergent control flow. 1425 /// Active threads: 1426 /// The minimal set of threads that has to be alive upon entry to this 1427 /// function. The computation is correct iff active threads are alive. 1428 /// Some threads are alive but they are not active because they do not 1429 /// contribute to the computation in any useful manner. Turning them off 1430 /// may introduce control flow overheads without any tangible benefits. 1431 /// Effective threads: 1432 /// In order to comply with the argument requirements of the shuffle 1433 /// function, we must keep all lanes holding data alive. But at most 1434 /// half of them perform value aggregation; we refer to this half of 1435 /// threads as effective. The other half is simply handing off their 1436 /// data. 1437 /// 1438 /// Procedure 1439 /// Value shuffle: 1440 /// In this step active threads transfer data from higher lane positions 1441 /// in the warp to lower lane positions, creating Remote Reduce list. 1442 /// Value aggregation: 1443 /// In this step, effective threads combine their thread local Reduce list 1444 /// with Remote Reduce list and store the result in the thread local 1445 /// Reduce list. 1446 /// Value copy: 1447 /// In this step, we deal with the assumption made by algorithm 2 1448 /// (i.e. contiguity assumption). 
When we have an odd number of lanes 1449 /// active, say 2k+1, only k threads will be effective and therefore k 1450 /// new values will be produced. However, the Reduce list owned by the 1451 /// (2k+1)th thread is ignored in the value aggregation. Therefore 1452 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so 1453 /// that the contiguity assumption still holds. 1454 /// 1455 /// \param ReductionInfos Array type containing the ReductionOps. 1456 /// \param ReduceFn The reduction function. 1457 /// \param FuncAttrs Optional param to specify any function attributes that 1458 /// need to be copied to the new function. 1459 /// 1460 /// \return The ShuffleAndReduce function. 1461 Function *emitShuffleAndReduceFunction( 1462 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, 1463 Function *ReduceFn, AttributeList FuncAttrs); 1464 1465 /// This function emits a helper that gathers Reduce lists from the first 1466 /// lane of every active warp to lanes in the first warp. 1467 /// 1468 /// void inter_warp_copy_func(void* reduce_data, num_warps) 1469 /// shared smem[warp_size]; 1470 /// For all data entries D in reduce_data: 1471 /// sync 1472 /// If (I am the first lane in each warp) 1473 /// Copy my local D to smem[warp_id] 1474 /// sync 1475 /// if (I am the first warp) 1476 /// Copy smem[thread_id] to my local D 1477 /// 1478 /// \param Loc The insert and source location description. 1479 /// \param ReductionInfos Array type containing the ReductionOps. 1480 /// \param FuncAttrs Optional param to specify any function attributes that 1481 /// need to be copied to the new function. 1482 /// 1483 /// \return The InterWarpCopy function. 
1484 Function *emitInterWarpCopyFunction(const LocationDescription &Loc, 1485 ArrayRef<ReductionInfo> ReductionInfos, 1486 AttributeList FuncAttrs); 1487 1488 /// This function emits a helper that copies all the reduction variables from 1489 /// the team into the provided global buffer for the reduction variables. 1490 /// 1491 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 1492 /// For all data entries D in reduce_data: 1493 /// Copy local D to buffer.D[Idx] 1494 /// 1495 /// \param ReductionInfos Array type containing the ReductionOps. 1496 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1497 /// \param FuncAttrs Optional param to specify any function attributes that 1498 /// need to be copied to the new function. 1499 /// 1500 /// \return The ListToGlobalCopy function. 1501 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, 1502 Type *ReductionsBufferTy, 1503 AttributeList FuncAttrs); 1504 1505 /// This function emits a helper that copies all the reduction variables from 1506 /// the team into the provided global buffer for the reduction variables. 1507 /// 1508 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 1509 /// For all data entries D in reduce_data: 1510 /// Copy buffer.D[Idx] to local D; 1511 /// 1512 /// \param ReductionInfos Array type containing the ReductionOps. 1513 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1514 /// \param FuncAttrs Optional param to specify any function attributes that 1515 /// need to be copied to the new function. 1516 /// 1517 /// \return The GlobalToList function. 1518 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos, 1519 Type *ReductionsBufferTy, 1520 AttributeList FuncAttrs); 1521 1522 /// This function emits a helper that reduces all the reduction variables from 1523 /// the team into the provided global buffer for the reduction variables. 
1524 /// 1525 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) 1526 /// void *GlobPtrs[]; 1527 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 1528 /// ... 1529 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 1530 /// reduce_function(GlobPtrs, reduce_data); 1531 /// 1532 /// \param ReductionInfos Array type containing the ReductionOps. 1533 /// \param ReduceFn The reduction function. 1534 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1535 /// \param FuncAttrs Optional param to specify any function attributes that 1536 /// need to be copied to the new function. 1537 /// 1538 /// \return The ListToGlobalReduce function. 1539 Function * 1540 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, 1541 Function *ReduceFn, Type *ReductionsBufferTy, 1542 AttributeList FuncAttrs); 1543 1544 /// This function emits a helper that reduces all the reduction variables from 1545 /// the team into the provided global buffer for the reduction variables. 1546 /// 1547 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) 1548 /// void *GlobPtrs[]; 1549 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 1550 /// ... 1551 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 1552 /// reduce_function(reduce_data, GlobPtrs); 1553 /// 1554 /// \param ReductionInfos Array type containing the ReductionOps. 1555 /// \param ReduceFn The reduction function. 1556 /// \param ReductionsBufferTy The StructTy for the reductions buffer. 1557 /// \param FuncAttrs Optional param to specify any function attributes that 1558 /// need to be copied to the new function. 1559 /// 1560 /// \return The GlobalToListReduce function. 1561 Function * 1562 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos, 1563 Function *ReduceFn, Type *ReductionsBufferTy, 1564 AttributeList FuncAttrs); 1565 1566 /// Get the function name of a reduction function. 
1567 std::string getReductionFuncName(StringRef Name) const; 1568 1569 /// Emits reduction function. 1570 /// \param ReducerName Name of the function calling the reduction. 1571 /// \param ReductionInfos Array type containing the ReductionOps. 1572 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR 1573 /// CodeGenCB kind. 1574 /// \param FuncAttrs Optional param to specify any function attributes that 1575 /// need to be copied to the new function. 1576 /// 1577 /// \return The reduction function. 1578 Function *createReductionFunction( 1579 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, 1580 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, 1581 AttributeList FuncAttrs = {}); 1582 1583 public: 1584 /// 1585 /// Design of OpenMP reductions on the GPU 1586 /// 1587 /// Consider a typical OpenMP program with one or more reduction 1588 /// clauses: 1589 /// 1590 /// float foo; 1591 /// double bar; 1592 /// #pragma omp target teams distribute parallel for \ 1593 /// reduction(+:foo) reduction(*:bar) 1594 /// for (int i = 0; i < N; i++) { 1595 /// foo += A[i]; bar *= B[i]; 1596 /// } 1597 /// 1598 /// where 'foo' and 'bar' are reduced across all OpenMP threads in 1599 /// all teams. In our OpenMP implementation on the NVPTX device an 1600 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 1601 /// within a team are mapped to CUDA threads within a threadblock. 1602 /// Our goal is to efficiently aggregate values across all OpenMP 1603 /// threads such that: 1604 /// 1605 /// - the compiler and runtime are logically concise, and 1606 /// - the reduction is performed efficiently in a hierarchical 1607 /// manner as follows: within OpenMP threads in the same warp, 1608 /// across warps in a threadblock, and finally across teams on 1609 /// the NVPTX device. 
1610 /// 1611 /// Introduction to Decoupling 1612 /// 1613 /// We would like to decouple the compiler and the runtime so that the 1614 /// latter is ignorant of the reduction variables (number, data types) 1615 /// and the reduction operators. This allows a simpler interface 1616 /// and implementation while still attaining good performance. 1617 /// 1618 /// Pseudocode for the aforementioned OpenMP program generated by the 1619 /// compiler is as follows: 1620 /// 1621 /// 1. Create private copies of reduction variables on each OpenMP 1622 /// thread: 'foo_private', 'bar_private' 1623 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 1624 /// to it and writes the result in 'foo_private' and 'bar_private' 1625 /// respectively. 1626 /// 3. Call the OpenMP runtime on the GPU to reduce within a team 1627 /// and store the result on the team master: 1628 /// 1629 /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 1630 /// reduceData, shuffleReduceFn, interWarpCpyFn) 1631 /// 1632 /// where: 1633 /// struct ReduceData { 1634 /// double *foo; 1635 /// double *bar; 1636 /// } reduceData 1637 /// reduceData.foo = &foo_private 1638 /// reduceData.bar = &bar_private 1639 /// 1640 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 1641 /// auxiliary functions generated by the compiler that operate on 1642 /// variables of type 'ReduceData'. They aid the runtime perform 1643 /// algorithmic steps in a data agnostic manner. 1644 /// 1645 /// 'shuffleReduceFn' is a pointer to a function that reduces data 1646 /// of type 'ReduceData' across two OpenMP threads (lanes) in the 1647 /// same warp. It takes the following arguments as input: 1648 /// 1649 /// a. variable of type 'ReduceData' on the calling lane, 1650 /// b. its lane_id, 1651 /// c. an offset relative to the current lane_id to generate a 1652 /// remote_lane_id. The remote lane contains the second 1653 /// variable of type 'ReduceData' that is to be reduced. 1654 /// d. 
an algorithm version parameter determining which reduction 1655 /// algorithm to use. 1656 /// 1657 /// 'shuffleReduceFn' retrieves data from the remote lane using 1658 /// efficient GPU shuffle intrinsics and reduces, using the 1659 /// algorithm specified by the 4th parameter, the two operands 1660 /// element-wise. The result is written to the first operand. 1661 /// 1662 /// Different reduction algorithms are implemented in different 1663 /// runtime functions, all calling 'shuffleReduceFn' to perform 1664 /// the essential reduction step. Therefore, based on the 4th 1665 /// parameter, this function behaves slightly differently to 1666 /// cooperate with the runtime to ensure correctness under 1667 /// different circumstances. 1668 /// 1669 /// 'InterWarpCpyFn' is a pointer to a function that transfers 1670 /// reduced variables across warps. It tunnels, through CUDA 1671 /// shared memory, the thread-private data of type 'ReduceData' 1672 /// from lane 0 of each warp to a lane in the first warp. 1673 /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 1674 /// The last team writes the global reduced value to memory. 1675 /// 1676 /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 1677 /// reduceData, shuffleReduceFn, interWarpCpyFn, 1678 /// scratchpadCopyFn, loadAndReduceFn) 1679 /// 1680 /// 'scratchpadCopyFn' is a helper that stores reduced 1681 /// data from the team master to a scratchpad array in 1682 /// global memory. 1683 /// 1684 /// 'loadAndReduceFn' is a helper that loads data from 1685 /// the scratchpad array and reduces it with the input 1686 /// operand. 1687 /// 1688 /// These compiler generated functions hide address 1689 /// calculation and alignment information from the runtime. 1690 /// 5. if ret == 1: 1691 /// The team master of the last team stores the reduced 1692 /// result to the globals in memory. 
1693 /// foo += reduceData.foo; bar *= reduceData.bar 1694 /// 1695 /// 1696 /// Warp Reduction Algorithms 1697 /// 1698 /// On the warp level, we have three algorithms implemented in the 1699 /// OpenMP runtime depending on the number of active lanes: 1700 /// 1701 /// Full Warp Reduction 1702 /// 1703 /// The reduce algorithm within a warp where all lanes are active 1704 /// is implemented in the runtime as follows: 1705 /// 1706 /// full_warp_reduce(void *reduce_data, 1707 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1708 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 1709 /// ShuffleReduceFn(reduce_data, 0, offset, 0); 1710 /// } 1711 /// 1712 /// The algorithm completes in log(2, WARPSIZE) steps. 1713 /// 1714 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 1715 /// not used therefore we save instructions by not retrieving lane_id 1716 /// from the corresponding special registers. The 4th parameter, which 1717 /// represents the version of the algorithm being used, is set to 0 to 1718 /// signify full warp reduction. 1719 /// 1720 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1721 /// 1722 /// #reduce_elem refers to an element in the local lane's data structure 1723 /// #remote_elem is retrieved from a remote lane 1724 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1725 /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 1726 /// 1727 /// Contiguous Partial Warp Reduction 1728 /// 1729 /// This reduce algorithm is used within a warp where only the first 1730 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the 1731 /// number of OpenMP threads in a parallel region is not a multiple of 1732 /// WARPSIZE. 
The algorithm is implemented in the runtime as follows: 1733 /// 1734 /// void 1735 /// contiguous_partial_reduce(void *reduce_data, 1736 /// kmp_ShuffleReductFctPtr ShuffleReduceFn, 1737 /// int size, int lane_id) { 1738 /// int curr_size; 1739 /// int offset; 1740 /// curr_size = size; 1741 /// mask = curr_size/2; 1742 /// while (offset>0) { 1743 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); 1744 /// curr_size = (curr_size+1)/2; 1745 /// offset = curr_size/2; 1746 /// } 1747 /// } 1748 /// 1749 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1750 /// 1751 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1752 /// if (lane_id < offset) 1753 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1754 /// else 1755 /// reduce_elem = remote_elem 1756 /// 1757 /// This algorithm assumes that the data to be reduced are located in a 1758 /// contiguous subset of lanes starting from the first. When there is 1759 /// an odd number of active lanes, the data in the last lane is not 1760 /// aggregated with any other lane's dat but is instead copied over. 1761 /// 1762 /// Dispersed Partial Warp Reduction 1763 /// 1764 /// This algorithm is used within a warp when any discontiguous subset of 1765 /// lanes are active. It is used to implement the reduction operation 1766 /// across lanes in an OpenMP simd region or in a nested parallel region. 1767 /// 1768 /// void 1769 /// dispersed_partial_reduce(void *reduce_data, 1770 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1771 /// int size, remote_id; 1772 /// int logical_lane_id = number_of_active_lanes_before_me() * 2; 1773 /// do { 1774 /// remote_id = next_active_lane_id_right_after_me(); 1775 /// # the above function returns 0 of no active lane 1776 /// # is present right after the current lane. 
1777 /// size = number_of_active_lanes_in_this_warp(); 1778 /// logical_lane_id /= 2; 1779 /// ShuffleReduceFn(reduce_data, logical_lane_id, 1780 /// remote_id-1-threadIdx.x, 2); 1781 /// } while (logical_lane_id % 2 == 0 && size > 1); 1782 /// } 1783 /// 1784 /// There is no assumption made about the initial state of the reduction. 1785 /// Any number of lanes (>=1) could be active at any position. The reduction 1786 /// result is returned in the first active lane. 1787 /// 1788 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1789 /// 1790 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1791 /// if (lane_id % 2 == 0 && offset > 0) 1792 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1793 /// else 1794 /// reduce_elem = remote_elem 1795 /// 1796 /// 1797 /// Intra-Team Reduction 1798 /// 1799 /// This function, as implemented in the runtime call 1800 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 1801 /// threads in a team. It first reduces within a warp using the 1802 /// aforementioned algorithms. We then proceed to gather all such 1803 /// reduced values at the first warp. 1804 /// 1805 /// The runtime makes use of the function 'InterWarpCpyFn', which copies 1806 /// data from each of the "warp master" (zeroth lane of each warp, where 1807 /// warp-reduced data is held) to the zeroth warp. This step reduces (in 1808 /// a mathematical sense) the problem of reduction across warp masters in 1809 /// a block to the problem of warp reduction. 1810 /// 1811 /// 1812 /// Inter-Team Reduction 1813 /// 1814 /// Once a team has reduced its data to a single value, it is stored in 1815 /// a global scratchpad array. Since each team has a distinct slot, this 1816 /// can be done without locking. 1817 /// 1818 /// The last team to write to the scratchpad array proceeds to reduce the 1819 /// scratchpad array. 
One or more workers in the last team use the helper 1820 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 1821 /// the k'th worker reduces every k'th element. 1822 /// 1823 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 1824 /// reduce across workers and compute a globally reduced value. 1825 /// 1826 /// \param Loc The location where the reduction was 1827 /// encountered. Must be within the associate 1828 /// directive and after the last local access to the 1829 /// reduction variables. 1830 /// \param AllocaIP An insertion point suitable for allocas usable 1831 /// in reductions. 1832 /// \param CodeGenIP An insertion point suitable for code 1833 /// generation. \param ReductionInfos A list of info on each reduction 1834 /// variable. \param IsNoWait Optional flag set if the reduction is 1835 /// marked as 1836 /// nowait. 1837 /// \param IsTeamsReduction Optional flag set if it is a teams 1838 /// reduction. 1839 /// \param HasDistribute Optional flag set if it is a 1840 /// distribute reduction. 1841 /// \param GridValue Optional GPU grid value. 1842 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be 1843 /// used for teams reduction. 1844 /// \param SrcLocInfo Source location information global. 1845 InsertPointTy createReductionsGPU( 1846 const LocationDescription &Loc, InsertPointTy AllocaIP, 1847 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos, 1848 bool IsNoWait = false, bool IsTeamsReduction = false, 1849 bool HasDistribute = false, 1850 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR, 1851 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024, 1852 Value *SrcLocInfo = nullptr); 1853 1854 // TODO: provide atomic and non-atomic reduction generators for reduction 1855 // operators defined by the OpenMP specification. 1856 1857 /// Generator for '#omp reduction'. 
1858 /// 1859 /// Emits the IR instructing the runtime to perform the specific kind of 1860 /// reductions. Expects reduction variables to have been privatized and 1861 /// initialized to reduction-neutral values separately. Emits the calls to 1862 /// runtime functions as well as the reduction function and the basic blocks 1863 /// performing the reduction atomically and non-atomically. 1864 /// 1865 /// The code emitted for the following: 1866 /// 1867 /// \code 1868 /// type var_1; 1869 /// type var_2; 1870 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2) 1871 /// /* body */; 1872 /// \endcode 1873 /// 1874 /// corresponds to the following sketch. 1875 /// 1876 /// \code 1877 /// void _outlined_par() { 1878 /// // N is the number of different reductions. 1879 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...}; 1880 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array, 1881 /// _omp_reduction_func, 1882 /// _gomp_critical_user.reduction.var)) { 1883 /// case 1: { 1884 /// var_1 = var_1 <reduction-op> privatized_var_1; 1885 /// var_2 = var_2 <reduction-op> privatized_var_2; 1886 /// // ... 1887 /// __kmpc_end_reduce(...); 1888 /// break; 1889 /// } 1890 /// case 2: { 1891 /// _Atomic<ReductionOp>(var_1, privatized_var_1); 1892 /// _Atomic<ReductionOp>(var_2, privatized_var_2); 1893 /// // ... 1894 /// break; 1895 /// } 1896 /// default: break; 1897 /// } 1898 /// } 1899 /// 1900 /// void _omp_reduction_func(void **lhs, void **rhs) { 1901 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0]; 1902 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1]; 1903 /// // ... 1904 /// } 1905 /// \endcode 1906 /// 1907 /// \param Loc The location where the reduction was 1908 /// encountered. Must be within the associate 1909 /// directive and after the last local access to the 1910 /// reduction variables. 
1911 /// \param AllocaIP An insertion point suitable for allocas usable 1912 /// in reductions. 1913 /// \param ReductionInfos A list of info on each reduction variable. 1914 /// \param IsNoWait A flag set if the reduction is marked as nowait. 1915 /// \param IsByRef A flag set if the reduction is using reference 1916 /// or direct value. 1917 InsertPointTy createReductions(const LocationDescription &Loc, 1918 InsertPointTy AllocaIP, 1919 ArrayRef<ReductionInfo> ReductionInfos, 1920 ArrayRef<bool> IsByRef, bool IsNoWait = false); 1921 1922 ///} 1923 1924 /// Return the insertion point used by the underlying IRBuilder. getInsertionPoint()1925 InsertPointTy getInsertionPoint() { return Builder.saveIP(); } 1926 1927 /// Update the internal location to \p Loc. updateToLocation(const LocationDescription & Loc)1928 bool updateToLocation(const LocationDescription &Loc) { 1929 Builder.restoreIP(Loc.IP); 1930 Builder.SetCurrentDebugLocation(Loc.DL); 1931 return Loc.IP.getBlock() != nullptr; 1932 } 1933 1934 /// Return the function declaration for the runtime function with \p FnID. 1935 FunctionCallee getOrCreateRuntimeFunction(Module &M, 1936 omp::RuntimeFunction FnID); 1937 1938 Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID); 1939 1940 /// Return the (LLVM-IR) string describing the source location \p LocStr. 1941 Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize); 1942 1943 /// Return the (LLVM-IR) string describing the default source location. 1944 Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize); 1945 1946 /// Return the (LLVM-IR) string describing the source location identified by 1947 /// the arguments. 1948 Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName, 1949 unsigned Line, unsigned Column, 1950 uint32_t &SrcLocStrSize); 1951 1952 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as 1953 /// fallback if \p DL does not specify the function name. 
1954 Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize, 1955 Function *F = nullptr); 1956 1957 /// Return the (LLVM-IR) string describing the source location \p Loc. 1958 Constant *getOrCreateSrcLocStr(const LocationDescription &Loc, 1959 uint32_t &SrcLocStrSize); 1960 1961 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags. 1962 /// TODO: Create a enum class for the Reserve2Flags 1963 Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, 1964 omp::IdentFlag Flags = omp::IdentFlag(0), 1965 unsigned Reserve2Flags = 0); 1966 1967 /// Create a hidden global flag \p Name in the module with initial value \p 1968 /// Value. 1969 GlobalValue *createGlobalFlag(unsigned Value, StringRef Name); 1970 1971 /// Generate control flow and cleanup for cancellation. 1972 /// 1973 /// \param CancelFlag Flag indicating if the cancellation is performed. 1974 /// \param CanceledDirective The kind of directive that is cancled. 1975 /// \param ExitCB Extra code to be generated in the exit block. 1976 void emitCancelationCheckImpl(Value *CancelFlag, 1977 omp::Directive CanceledDirective, 1978 FinalizeCallbackTy ExitCB = {}); 1979 1980 /// Generate a target region entry call. 1981 /// 1982 /// \param Loc The location at which the request originated and is fulfilled. 1983 /// \param AllocaIP The insertion point to be used for alloca instructions. 1984 /// \param Return Return value of the created function returned by reference. 1985 /// \param DeviceID Identifier for the device via the 'device' clause. 1986 /// \param NumTeams Numer of teams for the region via the 'num_teams' clause 1987 /// or 0 if unspecified and -1 if there is no 'teams' clause. 1988 /// \param NumThreads Number of threads via the 'thread_limit' clause. 1989 /// \param HostPtr Pointer to the host-side pointer of the target kernel. 1990 /// \param KernelArgs Array of arguments to the kernel. 
1991 InsertPointTy emitTargetKernel(const LocationDescription &Loc, 1992 InsertPointTy AllocaIP, Value *&Return, 1993 Value *Ident, Value *DeviceID, Value *NumTeams, 1994 Value *NumThreads, Value *HostPtr, 1995 ArrayRef<Value *> KernelArgs); 1996 1997 /// Generate a flush runtime call. 1998 /// 1999 /// \param Loc The location at which the request originated and is fulfilled. 2000 void emitFlush(const LocationDescription &Loc); 2001 2002 /// The finalization stack made up of finalize callbacks currently in-flight, 2003 /// wrapped into FinalizationInfo objects that reference also the finalization 2004 /// target block and the kind of cancellable directive. 2005 SmallVector<FinalizationInfo, 8> FinalizationStack; 2006 2007 /// Return true if the last entry in the finalization stack is of kind \p DK 2008 /// and cancellable. isLastFinalizationInfoCancellable(omp::Directive DK)2009 bool isLastFinalizationInfoCancellable(omp::Directive DK) { 2010 return !FinalizationStack.empty() && 2011 FinalizationStack.back().IsCancellable && 2012 FinalizationStack.back().DK == DK; 2013 } 2014 2015 /// Generate a taskwait runtime call. 2016 /// 2017 /// \param Loc The location at which the request originated and is fulfilled. 2018 void emitTaskwaitImpl(const LocationDescription &Loc); 2019 2020 /// Generate a taskyield runtime call. 2021 /// 2022 /// \param Loc The location at which the request originated and is fulfilled. 2023 void emitTaskyieldImpl(const LocationDescription &Loc); 2024 2025 /// Return the current thread ID. 2026 /// 2027 /// \param Ident The ident (ident_t*) describing the query origin. 2028 Value *getOrCreateThreadID(Value *Ident); 2029 2030 /// The OpenMPIRBuilder Configuration 2031 OpenMPIRBuilderConfig Config; 2032 2033 /// The underlying LLVM-IR module 2034 Module &M; 2035 2036 /// The LLVM-IR Builder used to create IR. 
2037 IRBuilder<> Builder; 2038 2039 /// Map to remember source location strings 2040 StringMap<Constant *> SrcLocStrMap; 2041 2042 /// Map to remember existing ident_t*. 2043 DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap; 2044 2045 /// Info manager to keep track of target regions. 2046 OffloadEntriesInfoManager OffloadInfoManager; 2047 2048 /// The target triple of the underlying module. 2049 const Triple T; 2050 2051 /// Helper that contains information about regions we need to outline 2052 /// during finalization. 2053 struct OutlineInfo { 2054 using PostOutlineCBTy = std::function<void(Function &)>; 2055 PostOutlineCBTy PostOutlineCB; 2056 BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB; 2057 SmallVector<Value *, 2> ExcludeArgsFromAggregate; 2058 2059 /// Collect all blocks in between EntryBB and ExitBB in both the given 2060 /// vector and set. 2061 void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet, 2062 SmallVectorImpl<BasicBlock *> &BlockVector); 2063 2064 /// Return the function that contains the region to be outlined. getFunctionOutlineInfo2065 Function *getFunction() const { return EntryBB->getParent(); } 2066 }; 2067 2068 /// Collection of regions that need to be outlined during finalization. 2069 SmallVector<OutlineInfo, 16> OutlineInfos; 2070 2071 /// A collection of candidate target functions that's constant allocas will 2072 /// attempt to be raised on a call of finalize after all currently enqueued 2073 /// outline info's have been processed. 2074 SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates; 2075 2076 /// Collection of owned canonical loop objects that eventually need to be 2077 /// free'd. 2078 std::forward_list<CanonicalLoopInfo> LoopInfos; 2079 2080 /// Add a new region that will be outlined later. addOutlineInfo(OutlineInfo && OI)2081 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); } 2082 2083 /// An ordered map of auto-generated variables to their unique names. 
2084 /// It stores variables with the following names: 1) ".gomp_critical_user_" + 2085 /// <critical_section_name> + ".var" for "omp critical" directives; 2) 2086 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate 2087 /// variables. 2088 StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars; 2089 2090 /// Computes the size of type in bytes. 2091 Value *getSizeInBytes(Value *BasePtr); 2092 2093 // Emit a branch from the current block to the Target block only if 2094 // the current block has a terminator. 2095 void emitBranch(BasicBlock *Target); 2096 2097 // If BB has no use then delete it and return. Else place BB after the current 2098 // block, if possible, or else at the end of the function. Also add a branch 2099 // from current block to BB if current block does not have a terminator. 2100 void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false); 2101 2102 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy 2103 /// Here is the logic: 2104 /// if (Cond) { 2105 /// ThenGen(); 2106 /// } else { 2107 /// ElseGen(); 2108 /// } 2109 void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, 2110 BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {}); 2111 2112 /// Create the global variable holding the offload mappings information. 2113 GlobalVariable *createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings, 2114 std::string VarName); 2115 2116 /// Create the global variable holding the offload names information. 2117 GlobalVariable * 2118 createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names, 2119 std::string VarName); 2120 2121 struct MapperAllocas { 2122 AllocaInst *ArgsBase = nullptr; 2123 AllocaInst *Args = nullptr; 2124 AllocaInst *ArgSizes = nullptr; 2125 }; 2126 2127 /// Create the allocas instruction used in call to mapper functions. 
2128 void createMapperAllocas(const LocationDescription &Loc, 2129 InsertPointTy AllocaIP, unsigned NumOperands, 2130 struct MapperAllocas &MapperAllocas); 2131 2132 /// Create the call for the target mapper function. 2133 /// \param Loc The source location description. 2134 /// \param MapperFunc Function to be called. 2135 /// \param SrcLocInfo Source location information global. 2136 /// \param MaptypesArg The argument types. 2137 /// \param MapnamesArg The argument names. 2138 /// \param MapperAllocas The AllocaInst used for the call. 2139 /// \param DeviceID Device ID for the call. 2140 /// \param NumOperands Number of operands in the call. 2141 void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, 2142 Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, 2143 struct MapperAllocas &MapperAllocas, int64_t DeviceID, 2144 unsigned NumOperands); 2145 2146 /// Container for the arguments used to pass data to the runtime library. 2147 struct TargetDataRTArgs { 2148 /// The array of base pointer passed to the runtime library. 2149 Value *BasePointersArray = nullptr; 2150 /// The array of section pointers passed to the runtime library. 2151 Value *PointersArray = nullptr; 2152 /// The array of sizes passed to the runtime library. 2153 Value *SizesArray = nullptr; 2154 /// The array of map types passed to the runtime library for the beginning 2155 /// of the region or for the entire region if there are no separate map 2156 /// types for the region end. 2157 Value *MapTypesArray = nullptr; 2158 /// The array of map types passed to the runtime library for the end of the 2159 /// region, or nullptr if there are no separate map types for the region 2160 /// end. 2161 Value *MapTypesArrayEnd = nullptr; 2162 /// The array of user-defined mappers passed to the runtime library. 
2163 Value *MappersArray = nullptr; 2164 /// The array of original declaration names of mapped pointers sent to the 2165 /// runtime library for debugging 2166 Value *MapNamesArray = nullptr; 2167 TargetDataRTArgsTargetDataRTArgs2168 explicit TargetDataRTArgs() {} TargetDataRTArgsTargetDataRTArgs2169 explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray, 2170 Value *SizesArray, Value *MapTypesArray, 2171 Value *MapTypesArrayEnd, Value *MappersArray, 2172 Value *MapNamesArray) 2173 : BasePointersArray(BasePointersArray), PointersArray(PointersArray), 2174 SizesArray(SizesArray), MapTypesArray(MapTypesArray), 2175 MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray), 2176 MapNamesArray(MapNamesArray) {} 2177 }; 2178 2179 /// Data structure that contains the needed information to construct the 2180 /// kernel args vector. 2181 struct TargetKernelArgs { 2182 /// Number of arguments passed to the runtime library. 2183 unsigned NumTargetItems; 2184 /// Arguments passed to the runtime library 2185 TargetDataRTArgs RTArgs; 2186 /// The number of iterations 2187 Value *NumIterations; 2188 /// The number of teams. 2189 Value *NumTeams; 2190 /// The number of threads. 2191 Value *NumThreads; 2192 /// The size of the dynamic shared memory. 2193 Value *DynCGGroupMem; 2194 /// True if the kernel has 'no wait' clause. 2195 bool HasNoWait; 2196 2197 /// Constructor for TargetKernelArgs TargetKernelArgsTargetKernelArgs2198 TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, 2199 Value *NumIterations, Value *NumTeams, Value *NumThreads, 2200 Value *DynCGGroupMem, bool HasNoWait) 2201 : NumTargetItems(NumTargetItems), RTArgs(RTArgs), 2202 NumIterations(NumIterations), NumTeams(NumTeams), 2203 NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem), 2204 HasNoWait(HasNoWait) {} 2205 }; 2206 2207 /// Create the kernel args vector used by emitTargetKernel. 
This function 2208 /// creates various constant values that are used in the resulting args 2209 /// vector. 2210 static void getKernelArgsVector(TargetKernelArgs &KernelArgs, 2211 IRBuilderBase &Builder, 2212 SmallVector<Value *> &ArgsVector); 2213 2214 /// Struct that keeps the information that should be kept throughout 2215 /// a 'target data' region. 2216 class TargetDataInfo { 2217 /// Set to true if device pointer information have to be obtained. 2218 bool RequiresDevicePointerInfo = false; 2219 /// Set to true if Clang emits separate runtime calls for the beginning and 2220 /// end of the region. These calls might have separate map type arrays. 2221 bool SeparateBeginEndCalls = false; 2222 2223 public: 2224 TargetDataRTArgs RTArgs; 2225 2226 SmallMapVector<const Value *, std::pair<Value *, Value *>, 4> 2227 DevicePtrInfoMap; 2228 2229 /// Indicate whether any user-defined mapper exists. 2230 bool HasMapper = false; 2231 /// The total number of pointers passed to the runtime library. 2232 unsigned NumberOfPtrs = 0u; 2233 TargetDataInfo()2234 explicit TargetDataInfo() {} TargetDataInfo(bool RequiresDevicePointerInfo,bool SeparateBeginEndCalls)2235 explicit TargetDataInfo(bool RequiresDevicePointerInfo, 2236 bool SeparateBeginEndCalls) 2237 : RequiresDevicePointerInfo(RequiresDevicePointerInfo), 2238 SeparateBeginEndCalls(SeparateBeginEndCalls) {} 2239 /// Clear information about the data arrays. clearArrayInfo()2240 void clearArrayInfo() { 2241 RTArgs = TargetDataRTArgs(); 2242 HasMapper = false; 2243 NumberOfPtrs = 0u; 2244 } 2245 /// Return true if the current target data information has valid arrays. 
isValid()2246 bool isValid() { 2247 return RTArgs.BasePointersArray && RTArgs.PointersArray && 2248 RTArgs.SizesArray && RTArgs.MapTypesArray && 2249 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs; 2250 } requiresDevicePointerInfo()2251 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; } separateBeginEndCalls()2252 bool separateBeginEndCalls() { return SeparateBeginEndCalls; } 2253 }; 2254 2255 enum class DeviceInfoTy { None, Pointer, Address }; 2256 using MapValuesArrayTy = SmallVector<Value *, 4>; 2257 using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>; 2258 using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>; 2259 using MapNamesArrayTy = SmallVector<Constant *, 4>; 2260 using MapDimArrayTy = SmallVector<uint64_t, 4>; 2261 using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>; 2262 2263 /// This structure contains combined information generated for mappable 2264 /// clauses, including base pointers, pointers, sizes, map types, user-defined 2265 /// mappers, and non-contiguous information. 2266 struct MapInfosTy { 2267 struct StructNonContiguousInfo { 2268 bool IsNonContiguous = false; 2269 MapDimArrayTy Dims; 2270 MapNonContiguousArrayTy Offsets; 2271 MapNonContiguousArrayTy Counts; 2272 MapNonContiguousArrayTy Strides; 2273 }; 2274 MapValuesArrayTy BasePointers; 2275 MapValuesArrayTy Pointers; 2276 MapDeviceInfoArrayTy DevicePointers; 2277 MapValuesArrayTy Sizes; 2278 MapFlagsArrayTy Types; 2279 MapNamesArrayTy Names; 2280 StructNonContiguousInfo NonContigInfo; 2281 2282 /// Append arrays in \a CurInfo. 
appendMapInfosTy2283 void append(MapInfosTy &CurInfo) { 2284 BasePointers.append(CurInfo.BasePointers.begin(), 2285 CurInfo.BasePointers.end()); 2286 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end()); 2287 DevicePointers.append(CurInfo.DevicePointers.begin(), 2288 CurInfo.DevicePointers.end()); 2289 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end()); 2290 Types.append(CurInfo.Types.begin(), CurInfo.Types.end()); 2291 Names.append(CurInfo.Names.begin(), CurInfo.Names.end()); 2292 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(), 2293 CurInfo.NonContigInfo.Dims.end()); 2294 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(), 2295 CurInfo.NonContigInfo.Offsets.end()); 2296 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(), 2297 CurInfo.NonContigInfo.Counts.end()); 2298 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(), 2299 CurInfo.NonContigInfo.Strides.end()); 2300 } 2301 }; 2302 2303 /// Callback function type for functions emitting the host fallback code that 2304 /// is executed when the kernel launch fails. It takes an insertion point as 2305 /// parameter where the code should be emitted. It returns an insertion point 2306 /// that points right after after the emitted code. 2307 using EmitFallbackCallbackTy = function_ref<InsertPointTy(InsertPointTy)>; 2308 2309 /// Generate a target region entry call and host fallback call. 2310 /// 2311 /// \param Loc The location at which the request originated and is fulfilled. 2312 /// \param OutlinedFn The outlined kernel function. 2313 /// \param OutlinedFnID The ooulined function ID. 2314 /// \param EmitTargetCallFallbackCB Call back function to generate host 2315 /// fallback code. 2316 /// \param Args Data structure holding information about the kernel arguments. 2317 /// \param DeviceID Identifier for the device via the 'device' clause. 
2318 /// \param RTLoc Source location identifier 2319 /// \param AllocaIP The insertion point to be used for alloca instructions. 2320 InsertPointTy emitKernelLaunch( 2321 const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, 2322 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, 2323 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP); 2324 2325 /// Generate a target-task for the target construct 2326 /// 2327 /// \param OutlinedFn The outlined device/target kernel function. 2328 /// \param OutlinedFnID The ooulined function ID. 2329 /// \param EmitTargetCallFallbackCB Call back function to generate host 2330 /// fallback code. 2331 /// \param Args Data structure holding information about the kernel arguments. 2332 /// \param DeviceID Identifier for the device via the 'device' clause. 2333 /// \param RTLoc Source location identifier 2334 /// \param AllocaIP The insertion point to be used for alloca instructions. 2335 /// \param Dependencies Vector of DependData objects holding information of 2336 /// dependencies as specified by the 'depend' clause. 2337 /// \param HasNoWait True if the target construct had 'nowait' on it, false 2338 /// otherwise 2339 InsertPointTy emitTargetTask( 2340 Function *OutlinedFn, Value *OutlinedFnID, 2341 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, 2342 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP, 2343 SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait); 2344 2345 /// Emit the arguments to be passed to the runtime library based on the 2346 /// arrays of base pointers, pointers, sizes, map types, and mappers. If 2347 /// ForEndCall, emit map types to be passed for the end of the region instead 2348 /// of the beginning. 
2349 void emitOffloadingArraysArgument(IRBuilderBase &Builder, 2350 OpenMPIRBuilder::TargetDataRTArgs &RTArgs, 2351 OpenMPIRBuilder::TargetDataInfo &Info, 2352 bool EmitDebug = false, 2353 bool ForEndCall = false); 2354 2355 /// Emit an array of struct descriptors to be assigned to the offload args. 2356 void emitNonContiguousDescriptor(InsertPointTy AllocaIP, 2357 InsertPointTy CodeGenIP, 2358 MapInfosTy &CombinedInfo, 2359 TargetDataInfo &Info); 2360 2361 /// Emit the arrays used to pass the captures and map information to the 2362 /// offloading runtime library. If there is no map or capture information, 2363 /// return nullptr by reference. 2364 void emitOffloadingArrays( 2365 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, 2366 TargetDataInfo &Info, bool IsNonContiguous = false, 2367 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr, 2368 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr); 2369 2370 /// Creates offloading entry for the provided entry ID \a ID, address \a 2371 /// Addr, size \a Size, and flags \a Flags. 2372 void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, 2373 int32_t Flags, GlobalValue::LinkageTypes, 2374 StringRef Name = ""); 2375 2376 /// The kind of errors that can occur when emitting the offload entries and 2377 /// metadata. 2378 enum EmitMetadataErrorKind { 2379 EMIT_MD_TARGET_REGION_ERROR, 2380 EMIT_MD_DECLARE_TARGET_ERROR, 2381 EMIT_MD_GLOBAL_VAR_LINK_ERROR 2382 }; 2383 2384 /// Callback function type 2385 using EmitMetadataErrorReportFunctionTy = 2386 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>; 2387 2388 // Emit the offloading entries and metadata so that the device codegen side 2389 // can easily figure out what to emit. The produced metadata looks like 2390 // this: 2391 // 2392 // !omp_offload.info = !{!1, ...} 2393 // 2394 // We only generate metadata for function that contain target regions. 
2395 void createOffloadEntriesAndInfoMetadata( 2396 EmitMetadataErrorReportFunctionTy &ErrorReportFunction); 2397 2398 public: 2399 /// Generator for __kmpc_copyprivate 2400 /// 2401 /// \param Loc The source location description. 2402 /// \param BufSize Number of elements in the buffer. 2403 /// \param CpyBuf List of pointers to data to be copied. 2404 /// \param CpyFn function to call for copying data. 2405 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise. 2406 /// 2407 /// \return The insertion position *after* the CopyPrivate call. 2408 2409 InsertPointTy createCopyPrivate(const LocationDescription &Loc, 2410 llvm::Value *BufSize, llvm::Value *CpyBuf, 2411 llvm::Value *CpyFn, llvm::Value *DidIt); 2412 2413 /// Generator for '#omp single' 2414 /// 2415 /// \param Loc The source location description. 2416 /// \param BodyGenCB Callback that will generate the region code. 2417 /// \param FiniCB Callback to finalize variable copies. 2418 /// \param IsNowait If false, a barrier is emitted. 2419 /// \param CPVars copyprivate variables. 2420 /// \param CPFuncs copy functions to use for each copyprivate variable. 2421 /// 2422 /// \returns The insertion position *after* the single call. 2423 InsertPointTy createSingle(const LocationDescription &Loc, 2424 BodyGenCallbackTy BodyGenCB, 2425 FinalizeCallbackTy FiniCB, bool IsNowait, 2426 ArrayRef<llvm::Value *> CPVars = {}, 2427 ArrayRef<llvm::Function *> CPFuncs = {}); 2428 2429 /// Generator for '#omp master' 2430 /// 2431 /// \param Loc The insert and source location description. 2432 /// \param BodyGenCB Callback that will generate the region code. 2433 /// \param FiniCB Callback to finalize variable copies. 2434 /// 2435 /// \returns The insertion position *after* the master. 
2436 InsertPointTy createMaster(const LocationDescription &Loc, 2437 BodyGenCallbackTy BodyGenCB, 2438 FinalizeCallbackTy FiniCB); 2439 2440 /// Generator for '#omp masked' 2441 /// 2442 /// \param Loc The insert and source location description. 2443 /// \param BodyGenCB Callback that will generate the region code. 2444 /// \param FiniCB Callback to finialize variable copies. 2445 /// 2446 /// \returns The insertion position *after* the masked. 2447 InsertPointTy createMasked(const LocationDescription &Loc, 2448 BodyGenCallbackTy BodyGenCB, 2449 FinalizeCallbackTy FiniCB, Value *Filter); 2450 2451 /// Generator for '#omp critical' 2452 /// 2453 /// \param Loc The insert and source location description. 2454 /// \param BodyGenCB Callback that will generate the region body code. 2455 /// \param FiniCB Callback to finalize variable copies. 2456 /// \param CriticalName name of the lock used by the critical directive 2457 /// \param HintInst Hint Instruction for hint clause associated with critical 2458 /// 2459 /// \returns The insertion position *after* the critical. 2460 InsertPointTy createCritical(const LocationDescription &Loc, 2461 BodyGenCallbackTy BodyGenCB, 2462 FinalizeCallbackTy FiniCB, 2463 StringRef CriticalName, Value *HintInst); 2464 2465 /// Generator for '#omp ordered depend (source | sink)' 2466 /// 2467 /// \param Loc The insert and source location description. 2468 /// \param AllocaIP The insertion point to be used for alloca instructions. 2469 /// \param NumLoops The number of loops in depend clause. 2470 /// \param StoreValues The value will be stored in vector address. 2471 /// \param Name The name of alloca instruction. 2472 /// \param IsDependSource If true, depend source; otherwise, depend sink. 2473 /// 2474 /// \return The insertion position *after* the ordered. 
2475 InsertPointTy createOrderedDepend(const LocationDescription &Loc, 2476 InsertPointTy AllocaIP, unsigned NumLoops, 2477 ArrayRef<llvm::Value *> StoreValues, 2478 const Twine &Name, bool IsDependSource); 2479 2480 /// Generator for '#omp ordered [threads | simd]' 2481 /// 2482 /// \param Loc The insert and source location description. 2483 /// \param BodyGenCB Callback that will generate the region code. 2484 /// \param FiniCB Callback to finalize variable copies. 2485 /// \param IsThreads If true, with threads clause or without clause; 2486 /// otherwise, with simd clause; 2487 /// 2488 /// \returns The insertion position *after* the ordered. 2489 InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, 2490 BodyGenCallbackTy BodyGenCB, 2491 FinalizeCallbackTy FiniCB, 2492 bool IsThreads); 2493 2494 /// Generator for '#omp sections' 2495 /// 2496 /// \param Loc The insert and source location description. 2497 /// \param AllocaIP The insertion points to be used for alloca instructions. 2498 /// \param SectionCBs Callbacks that will generate body of each section. 2499 /// \param PrivCB Callback to copy a given variable (think copy constructor). 2500 /// \param FiniCB Callback to finalize variable copies. 2501 /// \param IsCancellable Flag to indicate a cancellable parallel region. 2502 /// \param IsNowait If true, barrier - to ensure all sections are executed 2503 /// before moving forward will not be generated. 2504 /// \returns The insertion position *after* the sections. 2505 InsertPointTy createSections(const LocationDescription &Loc, 2506 InsertPointTy AllocaIP, 2507 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, 2508 PrivatizeCallbackTy PrivCB, 2509 FinalizeCallbackTy FiniCB, bool IsCancellable, 2510 bool IsNowait); 2511 2512 /// Generator for '#omp section' 2513 /// 2514 /// \param Loc The insert and source location description. 2515 /// \param BodyGenCB Callback that will generate the region body code. 
2516 /// \param FiniCB Callback to finalize variable copies. 2517 /// \returns The insertion position *after* the section. 2518 InsertPointTy createSection(const LocationDescription &Loc, 2519 BodyGenCallbackTy BodyGenCB, 2520 FinalizeCallbackTy FiniCB); 2521 2522 /// Generator for `#omp teams` 2523 /// 2524 /// \param Loc The location where the teams construct was encountered. 2525 /// \param BodyGenCB Callback that will generate the region code. 2526 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr, 2527 /// it is as if lower bound is specified as equal to upperbound. If 2528 /// this is non-null, then upperbound must also be non-null. 2529 /// \param NumTeamsUpper Upper bound on the number of teams. 2530 /// \param ThreadLimit on the number of threads that may participate in a 2531 /// contention group created by each team. 2532 /// \param IfExpr is the integer argument value of the if condition on the 2533 /// teams clause. 2534 InsertPointTy 2535 createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 2536 Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr, 2537 Value *ThreadLimit = nullptr, Value *IfExpr = nullptr); 2538 2539 /// Generate conditional branch and relevant BasicBlocks through which private 2540 /// threads copy the 'copyin' variables from Master copy to threadprivate 2541 /// copies. 2542 /// 2543 /// \param IP insertion block for copyin conditional 2544 /// \param MasterVarPtr a pointer to the master variable 2545 /// \param PrivateVarPtr a pointer to the threadprivate variable 2546 /// \param IntPtrTy Pointer size type 2547 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks 2548 // and copy.in.end block 2549 /// 2550 /// \returns The insertion point where copying operation to be emitted. 
2551 InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, 2552 Value *PrivateAddr, 2553 llvm::IntegerType *IntPtrTy, 2554 bool BranchtoEnd = true); 2555 2556 /// Create a runtime call for kmpc_Alloc 2557 /// 2558 /// \param Loc The insert and source location description. 2559 /// \param Size Size of allocated memory space 2560 /// \param Allocator Allocator information instruction 2561 /// \param Name Name of call Instruction for OMP_alloc 2562 /// 2563 /// \returns CallInst to the OMP_Alloc call 2564 CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size, 2565 Value *Allocator, std::string Name = ""); 2566 2567 /// Create a runtime call for kmpc_free 2568 /// 2569 /// \param Loc The insert and source location description. 2570 /// \param Addr Address of memory space to be freed 2571 /// \param Allocator Allocator information instruction 2572 /// \param Name Name of call Instruction for OMP_Free 2573 /// 2574 /// \returns CallInst to the OMP_Free call 2575 CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr, 2576 Value *Allocator, std::string Name = ""); 2577 2578 /// Create a runtime call for kmpc_threadprivate_cached 2579 /// 2580 /// \param Loc The insert and source location description. 2581 /// \param Pointer pointer to data to be cached 2582 /// \param Size size of data to be cached 2583 /// \param Name Name of call Instruction for callinst 2584 /// 2585 /// \returns CallInst to the thread private cache call. 2586 CallInst *createCachedThreadPrivate(const LocationDescription &Loc, 2587 llvm::Value *Pointer, 2588 llvm::ConstantInt *Size, 2589 const llvm::Twine &Name = Twine("")); 2590 2591 /// Create a runtime call for __tgt_interop_init 2592 /// 2593 /// \param Loc The insert and source location description. 
2594 /// \param InteropVar variable to be allocated 2595 /// \param InteropType type of interop operation 2596 /// \param Device devide to which offloading will occur 2597 /// \param NumDependences number of dependence variables 2598 /// \param DependenceAddress pointer to dependence variables 2599 /// \param HaveNowaitClause does nowait clause exist 2600 /// 2601 /// \returns CallInst to the __tgt_interop_init call 2602 CallInst *createOMPInteropInit(const LocationDescription &Loc, 2603 Value *InteropVar, 2604 omp::OMPInteropType InteropType, Value *Device, 2605 Value *NumDependences, 2606 Value *DependenceAddress, 2607 bool HaveNowaitClause); 2608 2609 /// Create a runtime call for __tgt_interop_destroy 2610 /// 2611 /// \param Loc The insert and source location description. 2612 /// \param InteropVar variable to be allocated 2613 /// \param Device devide to which offloading will occur 2614 /// \param NumDependences number of dependence variables 2615 /// \param DependenceAddress pointer to dependence variables 2616 /// \param HaveNowaitClause does nowait clause exist 2617 /// 2618 /// \returns CallInst to the __tgt_interop_destroy call 2619 CallInst *createOMPInteropDestroy(const LocationDescription &Loc, 2620 Value *InteropVar, Value *Device, 2621 Value *NumDependences, 2622 Value *DependenceAddress, 2623 bool HaveNowaitClause); 2624 2625 /// Create a runtime call for __tgt_interop_use 2626 /// 2627 /// \param Loc The insert and source location description. 
2628 /// \param InteropVar variable to be allocated 2629 /// \param Device devide to which offloading will occur 2630 /// \param NumDependences number of dependence variables 2631 /// \param DependenceAddress pointer to dependence variables 2632 /// \param HaveNowaitClause does nowait clause exist 2633 /// 2634 /// \returns CallInst to the __tgt_interop_use call 2635 CallInst *createOMPInteropUse(const LocationDescription &Loc, 2636 Value *InteropVar, Value *Device, 2637 Value *NumDependences, Value *DependenceAddress, 2638 bool HaveNowaitClause); 2639 2640 /// The `omp target` interface 2641 /// 2642 /// For more information about the usage of this interface, 2643 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h 2644 /// 2645 ///{ 2646 2647 /// Create a runtime call for kmpc_target_init 2648 /// 2649 /// \param Loc The insert and source location description. 2650 /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not. 2651 /// \param MinThreads Minimal number of threads, or 0. 2652 /// \param MaxThreads Maximal number of threads, or 0. 2653 /// \param MinTeams Minimal number of teams, or 0. 2654 /// \param MaxTeams Maximal number of teams, or 0. 2655 InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, 2656 int32_t MinThreadsVal = 0, 2657 int32_t MaxThreadsVal = 0, 2658 int32_t MinTeamsVal = 0, 2659 int32_t MaxTeamsVal = 0); 2660 2661 /// Create a runtime call for kmpc_target_deinit 2662 /// 2663 /// \param Loc The insert and source location description. 2664 /// \param TeamsReductionDataSize The maximal size of all the reduction data 2665 /// for teams reduction. 2666 /// \param TeamsReductionBufferLength The number of elements (each of up to 2667 /// \p TeamsReductionDataSize size), in the teams reduction buffer. 
2668 void createTargetDeinit(const LocationDescription &Loc, 2669 int32_t TeamsReductionDataSize = 0, 2670 int32_t TeamsReductionBufferLength = 1024); 2671 2672 ///} 2673 2674 /// Helpers to read/write kernel annotations from the IR. 2675 /// 2676 ///{ 2677 2678 /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none 2679 /// is set. 2680 static std::pair<int32_t, int32_t> 2681 readThreadBoundsForKernel(const Triple &T, Function &Kernel); 2682 static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, 2683 int32_t LB, int32_t UB); 2684 2685 /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none 2686 /// is set. 2687 static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T, 2688 Function &Kernel); 2689 static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, 2690 int32_t UB); 2691 ///} 2692 2693 private: 2694 // Sets the function attributes expected for the outlined function 2695 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn); 2696 2697 // Creates the function ID/Address for the given outlined function. 2698 // In the case of an embedded device function the address of the function is 2699 // used, in the case of a non-offload function a constant is created. 2700 Constant *createOutlinedFunctionID(Function *OutlinedFn, 2701 StringRef EntryFnIDName); 2702 2703 // Creates the region entry address for the outlined function 2704 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction, 2705 StringRef EntryFnName); 2706 2707 public: 2708 /// Functions used to generate a function with the given name. 2709 using FunctionGenCallback = std::function<Function *(StringRef FunctionName)>; 2710 2711 /// Create a unique name for the entry function using the source location 2712 /// information of the current target region. 
The name will be something like: 2713 /// 2714 /// __omp_offloading_DD_FFFF_PP_lBB[_CC] 2715 /// 2716 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the 2717 /// mangled name of the function that encloses the target region and BB is the 2718 /// line number of the target region. CC is a count added when more than one 2719 /// region is located at the same location. 2720 /// 2721 /// If this target outline function is not an offload entry, we don't need to 2722 /// register it. This may happen if it is guarded by an if clause that is 2723 /// false at compile time, or no target archs have been specified. 2724 /// 2725 /// The created target region ID is used by the runtime library to identify 2726 /// the current target region, so it only has to be unique and not 2727 /// necessarily point to anything. It could be the pointer to the outlined 2728 /// function that implements the target region, but we aren't using that so 2729 /// that the compiler doesn't need to keep that, and could therefore inline 2730 /// the host function if proven worthwhile during optimization. In the other 2731 /// hand, if emitting code for the device, the ID has to be the function 2732 /// address so that it can retrieved from the offloading entry and launched 2733 /// by the runtime library. We also mark the outlined function to have 2734 /// external linkage in case we are emitting code for the device, because 2735 /// these functions will be entry points to the device. 
2736 /// 2737 /// \param InfoManager The info manager keeping track of the offload entries 2738 /// \param EntryInfo The entry information about the function 2739 /// \param GenerateFunctionCallback The callback function to generate the code 2740 /// \param OutlinedFunction Pointer to the outlined function 2741 /// \param EntryFnIDName Name of the ID o be created 2742 void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, 2743 FunctionGenCallback &GenerateFunctionCallback, 2744 bool IsOffloadEntry, Function *&OutlinedFn, 2745 Constant *&OutlinedFnID); 2746 2747 /// Registers the given function and sets up the attribtues of the function 2748 /// Returns the FunctionID. 2749 /// 2750 /// \param InfoManager The info manager keeping track of the offload entries 2751 /// \param EntryInfo The entry information about the function 2752 /// \param OutlinedFunction Pointer to the outlined function 2753 /// \param EntryFnName Name of the outlined function 2754 /// \param EntryFnIDName Name of the ID o be created 2755 Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, 2756 Function *OutlinedFunction, 2757 StringRef EntryFnName, 2758 StringRef EntryFnIDName); 2759 2760 /// Type of BodyGen to use for region codegen 2761 /// 2762 /// Priv: If device pointer privatization is required, emit the body of the 2763 /// region here. It will have to be duplicated: with and without 2764 /// privatization. 2765 /// DupNoPriv: If we need device pointer privatization, we need 2766 /// to emit the body of the region with no privatization in the 'else' branch 2767 /// of the conditional. 2768 /// NoPriv: If we don't require privatization of device 2769 /// pointers, we emit the body in between the runtime calls. This avoids 2770 /// duplicating the body code. 2771 enum BodyGenTy { Priv, DupNoPriv, NoPriv }; 2772 2773 /// Callback type for creating the map infos for the kernel parameters. 
2774 /// \param CodeGenIP is the insertion point where code should be generated, 2775 /// if any. 2776 using GenMapInfoCallbackTy = 2777 function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>; 2778 2779 /// Generator for '#omp target data' 2780 /// 2781 /// \param Loc The location where the target data construct was encountered. 2782 /// \param AllocaIP The insertion points to be used for alloca instructions. 2783 /// \param CodeGenIP The insertion point at which the target directive code 2784 /// should be placed. 2785 /// \param IsBegin If true then emits begin mapper call otherwise emits 2786 /// end mapper call. 2787 /// \param DeviceID Stores the DeviceID from the device clause. 2788 /// \param IfCond Value which corresponds to the if clause condition. 2789 /// \param Info Stores all information realted to the Target Data directive. 2790 /// \param GenMapInfoCB Callback that populates the MapInfos and returns. 2791 /// \param BodyGenCB Optional Callback to generate the region code. 2792 /// \param DeviceAddrCB Optional callback to generate code related to 2793 /// use_device_ptr and use_device_addr. 2794 /// \param CustomMapperCB Optional callback to generate code related to 2795 /// custom mappers. 
2796 OpenMPIRBuilder::InsertPointTy createTargetData( 2797 const LocationDescription &Loc, InsertPointTy AllocaIP, 2798 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, 2799 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, 2800 omp::RuntimeFunction *MapperFunc = nullptr, 2801 function_ref<InsertPointTy(InsertPointTy CodeGenIP, 2802 BodyGenTy BodyGenType)> 2803 BodyGenCB = nullptr, 2804 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr, 2805 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr, 2806 Value *SrcLocInfo = nullptr); 2807 2808 using TargetBodyGenCallbackTy = function_ref<InsertPointTy( 2809 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>; 2810 2811 using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointTy( 2812 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, 2813 InsertPointTy CodeGenIP)>; 2814 2815 /// Generator for '#omp target' 2816 /// 2817 /// \param Loc where the target data construct was encountered. 2818 /// \param CodeGenIP The insertion point where the call to the outlined 2819 /// function should be emitted. 2820 /// \param EntryInfo The entry information about the function. 2821 /// \param NumTeams Number of teams specified in the num_teams clause. 2822 /// \param NumThreads Number of teams specified in the thread_limit clause. 2823 /// \param Inputs The input values to the region that will be passed. 2824 /// as arguments to the outlined function. 2825 /// \param BodyGenCB Callback that will generate the region code. 
  /// \param ArgAccessorFuncCB Callback that will generate accessor
  /// instructions for passed in target arguments where necessary.
  /// \param Dependencies A vector of DependData objects that carry
  /// dependency information as passed in the depend clause.
  InsertPointTy createTarget(const LocationDescription &Loc,
                             OpenMPIRBuilder::InsertPointTy AllocaIP,
                             OpenMPIRBuilder::InsertPointTy CodeGenIP,
                             TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
                             int32_t NumThreads,
                             SmallVectorImpl<Value *> &Inputs,
                             GenMapInfoCallbackTy GenMapInfoCB,
                             TargetBodyGenCallbackTy BodyGenCB,
                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
                             SmallVector<DependData> Dependencies = {});

  /// Returns __kmpc_for_static_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned. Will create a distribute call
  /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
  FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
                                             bool IsGPUDistribute);

  /// Returns __kmpc_dispatch_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);

  /// Returns __kmpc_dispatch_next_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);

  /// Returns __kmpc_dispatch_fini_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);

  /// Returns the __kmpc_dispatch_deinit runtime function.
  FunctionCallee createDispatchDeinitFunction();

  /// Declarations for LLVM-IR types (simple, array, function and structure) are
  /// generated below. Their names are defined and used in OpenMPKinds.def. Here
  /// we provide the declarations, the initializeTypes function will provide the
  /// values.
  ///
  ///{
#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  ArrayType *VarName##Ty = nullptr;                                            \
  PointerType *VarName##PtrTy = nullptr;
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  FunctionType *VarName = nullptr;                                             \
  PointerType *VarName##Ptr = nullptr;
#define OMP_STRUCT_TYPE(VarName, StrName, ...)                                 \
  StructType *VarName = nullptr;                                               \
  PointerType *VarName##Ptr = nullptr;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  ///}

private:
  /// Create all simple and struct types exposed by the runtime and remember
  /// the llvm::PointerTypes of them for easy access later.
  void initializeTypes(Module &M);

  /// Common interface for generating entry calls for OMP directives.
  /// If the directive has a region/body, it will set the insertion
  /// point to the body.
  ///
  /// \param OMPD Directive to generate entry blocks for.
  /// \param EntryCall Call to the entry OMP runtime function.
  /// \param ExitBB Block where the region ends.
  /// \param Conditional Indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  ///
  /// \return The insertion position in exit block
  InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
                                         BasicBlock *ExitBB,
                                         bool Conditional = false);

  /// Common interface to finalize the region
  ///
  /// \param OMPD Directive to generate exiting code for.
  /// \param FinIP Insertion point for emitting finalization code and exit call.
  /// \param ExitCall Call to the ending OMP runtime function.
  /// \param HasFinalize Indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  ///
  /// \return The insertion position in exit block
  InsertPointTy emitCommonDirectiveExit(omp::Directive OMPD,
                                        InsertPointTy FinIP,
                                        Instruction *ExitCall,
                                        bool HasFinalize = true);

  /// Common interface to generate OMP inlined regions
  ///
  /// \param OMPD Directive to generate inlined region for.
  /// \param EntryCall Call to the entry OMP runtime function.
  /// \param ExitCall Call to the ending OMP runtime function.
  /// \param BodyGenCB Body code generation callback.
  /// \param FiniCB Finalization callback; will be called when finalizing the
  ///        region.
  /// \param Conditional Indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  /// \param HasFinalize Indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  /// \param IsCancellable If HasFinalize is set to true, indicate if the
  ///        directive should be cancellable.
  ///
  /// \return The insertion point after the region
  InsertPointTy
  EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
                       Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
                       FinalizeCallbackTy FiniCB, bool Conditional = false,
                       bool HasFinalize = true, bool IsCancellable = false);

  /// Get the platform-specific name separator.
  /// \param Parts Different parts of the final name that need separation.
  /// \param FirstSeparator First separator used between the initial two
  ///        parts of the name.
  /// \param Separator Separator used between all of the rest consecutive
  ///        parts of the name.
  static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
                                           StringRef FirstSeparator,
                                           StringRef Separator);

  /// Returns corresponding lock object for the specified critical region
  /// name. If the lock object does not exist it is created, otherwise the
  /// reference to the existing copy is returned.
  /// \param CriticalName Name of the critical region.
  ///
  Value *getOMPCriticalRegionLock(StringRef CriticalName);

  /// Callback type for Atomic Expression update
  /// ex:
  /// \code{.cpp}
  /// unsigned x = 0;
  /// #pragma omp atomic update
  /// x = Expr(x_old); //Expr() is any legal operation
  /// \endcode
  ///
  /// \param XOld the value of the atomic memory address to use for update
  /// \param IRB reference to the IRBuilder to use
  ///
  /// \returns Value to update X to.
  using AtomicUpdateCallbackTy =
      const function_ref<Value *(Value *XOld, IRBuilder<> &IRB)>;

private:
  /// The kind of atomic operation being generated.
  enum AtomicKind { Read, Write, Update, Capture, Compare };

  /// Determine whether to emit flush or not
  ///
  /// \param Loc The insert and source location description.
  /// \param AO The required atomic ordering
  /// \param AK The OpenMP atomic operation kind used.
  ///
  /// \returns whether a flush was emitted or not
  bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
                                    AtomicOrdering AO, AtomicKind AK);

  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param AllocaIP The insertion point to be used for alloca
  ///                 instructions.
  /// \param X The target atomic pointer to be updated.
  /// \param XElemTy The element type of the atomic pointer.
  /// \param Expr The value to update X with.
  /// \param AO Atomic ordering of the generated atomic
  ///           instructions.
  /// \param RMWOp The binary operation used for update. If the
  ///              operation is not supported by atomicRMW,
  ///              or belongs to {FADD, FSUB, BAD_BINOP},
  ///              then a `cmpExch` based atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param VolatileX true if \a X is volatile.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \returns A pair of the old value of X before the update, and the value
  ///          used for the update.
  std::pair<Value *, Value *>
  emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
                   AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                   AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
                   bool IsXBinopExpr);

  /// Emit the binary op described by \p RMWOp, using \p Src1 and \p Src2.
  ///
  /// \return The instruction
  Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                AtomicRMWInst::BinOp RMWOp);

public:
  /// A struct to pack relevant information while generating atomic Ops
  struct AtomicOpValue {
    Value *Var = nullptr;    // Pointer to the value being accessed atomically.
    Type *ElemTy = nullptr;  // Element type of the accessed value.
    bool IsSigned = false;   // Whether the value is interpreted as signed.
    bool IsVolatile = false; // Whether the access is volatile.
  };

  /// Emit atomic Read for: V = X --- Only Scalar data types.
  ///
  /// \param Loc The insert and source location description.
  /// \param X The target pointer to be atomically read.
  /// \param V Memory address where to store atomically read
  ///          value.
  /// \param AO Atomic ordering of the generated atomic
  ///           instructions.
  ///
  /// \return Insertion point after generated atomic read IR.
  InsertPointTy createAtomicRead(const LocationDescription &Loc,
                                 AtomicOpValue &X, AtomicOpValue &V,
                                 AtomicOrdering AO);

  /// Emit atomic write for: X = Expr --- Only Scalar data types.
  ///
  /// \param Loc The insert and source location description.
  /// \param X The target pointer to be atomically written to.
  /// \param Expr The value to store.
  /// \param AO Atomic ordering of the generated atomic
  ///           instructions.
  ///
  /// \return Insertion point after generated atomic Write IR.
  InsertPointTy createAtomicWrite(const LocationDescription &Loc,
                                  AtomicOpValue &X, Value *Expr,
                                  AtomicOrdering AO);

  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X The target atomic pointer to be updated.
  /// \param Expr The value to update X with.
  /// \param AO Atomic ordering of the generated atomic instructions.
  /// \param RMWOp The binary operation used for update. If the operation
  ///              is not supported by atomicRMW, or belongs to
  ///              {FADD, FSUB, BAD_BINOP}, then a `cmpExch` based
  ///              atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \return Insertion point after generated atomic update IR.
  InsertPointTy createAtomicUpdate(const LocationDescription &Loc,
                                   InsertPointTy AllocaIP, AtomicOpValue &X,
                                   Value *Expr, AtomicOrdering AO,
                                   AtomicRMWInst::BinOp RMWOp,
                                   AtomicUpdateCallbackTy &UpdateOp,
                                   bool IsXBinopExpr);

  /// Emit atomic update for constructs: --- Only Scalar data types
  /// V = X; X = X BinOp Expr ,
  /// X = X BinOp Expr; V = X,
  /// V = X; X = Expr BinOp X,
  /// X = Expr BinOp X; V = X,
  /// V = X; X = UpdateOp(X),
  /// X = UpdateOp(X); V = X,
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X The target atomic pointer to be updated.
  /// \param V Memory address where to store captured value.
  /// \param Expr The value to update X with.
  /// \param AO Atomic ordering of the generated atomic instructions.
  /// \param RMWOp The binary operation used for update. If the
  ///              operation is not supported by atomicRMW, or belongs to
  ///              {FADD, FSUB, BAD_BINOP}, then a cmpExch based
  ///              atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param UpdateExpr true if X is an in place update of the form
  ///                   X = X BinOp Expr or X = Expr BinOp X.
  /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  /// \param IsPostfixUpdate true if original value of 'x' must be stored in
  ///                        'v', not an updated one.
  ///
  /// \return Insertion point after generated atomic capture IR.
  InsertPointTy
  createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP,
                      AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
                      AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                      AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
                      bool IsPostfixUpdate, bool IsXBinopExpr);

  /// Emit atomic compare for constructs: --- Only scalar data types
  /// cond-expr-stmt:
  /// x = x ordop expr ? expr : x;
  /// x = expr ordop x ? expr : x;
  /// x = x == e ? d : x;
  /// x = e == x ? d : x; (this one is not in the spec)
  /// cond-update-stmt:
  /// if (x ordop expr) { x = expr; }
  /// if (expr ordop x) { x = expr; }
  /// if (x == e) { x = d; }
  /// if (e == x) { x = d; } (this one is not in the spec)
  /// conditional-update-capture-atomic:
  /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
  /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
  /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  /// IsFailOnly=true)
  /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
  /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  /// IsFailOnly=true)
  ///
  /// \param Loc The insert and source location description.
  /// \param X The target atomic pointer to be updated.
  /// \param V Memory address where to store captured value (for
  ///          compare capture only).
  /// \param R Memory address where to store comparison result
  ///          (for compare capture with '==' only).
  /// \param E The expected value ('e') for forms that use an
  ///          equality comparison or an expression ('expr') for
  ///          forms that use 'ordop' (logically an atomic maximum or
  ///          minimum).
  /// \param D The desired value for forms that use an equality
  ///          comparison. For forms that use 'ordop', it should be
  ///          \p nullptr.
  /// \param AO Atomic ordering of the generated atomic instructions.
  /// \param Op Atomic compare operation. It can only be ==, <, or >.
  /// \param IsXBinopExpr True if the conditional statement is in the form where
  ///                     x is on LHS. It only matters for < or >.
  /// \param IsPostfixUpdate True if original value of 'x' must be stored in
  ///                        'v', not an updated one (for compare capture
  ///                        only).
  /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
  ///                   only when the comparison fails. This is only valid for
  ///                   the case the comparison is '=='.
  ///
  /// \return Insertion point after generated atomic capture IR.
  InsertPointTy
  createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
                      AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
                      AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
                      bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
  /// Overload of createAtomicCompare that additionally takes an explicit
  /// \p Failure ordering used for the failed-comparison path.
  InsertPointTy createAtomicCompare(const LocationDescription &Loc,
                                    AtomicOpValue &X, AtomicOpValue &V,
                                    AtomicOpValue &R, Value *E, Value *D,
                                    AtomicOrdering AO,
                                    omp::OMPAtomicCompareOp Op,
                                    bool IsXBinopExpr, bool IsPostfixUpdate,
                                    bool IsFailOnly, AtomicOrdering Failure);

  /// Create the control flow structure of a canonical OpenMP loop.
  ///
  /// The emitted loop will be disconnected, i.e. no edge to the loop's
  /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
  /// IRBuilder location is not preserved.
  ///
  /// \param DL DebugLoc used for the instructions in the skeleton.
  /// \param TripCount Value to be used for the trip count.
  /// \param F Function in which to insert the BasicBlocks.
  /// \param PreInsertBefore Where to insert BBs that execute before the body,
  ///                        typically the body itself.
  /// \param PostInsertBefore Where to insert BBs that execute after the body.
  /// \param Name Base name used to derive BB
  ///             and instruction names.
  ///
  /// \returns The CanonicalLoopInfo that represents the emitted loop.
  CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
                                        Function *F,
                                        BasicBlock *PreInsertBefore,
                                        BasicBlock *PostInsertBefore,
                                        const Twine &Name = {});
  /// OMP Offload Info Metadata name string
  const std::string ompOffloadInfoName = "omp_offload.info";

  /// Loads all the offload entries information from the host IR
  /// metadata. This function is only meant to be used with device code
  /// generation.
  ///
  /// \param M Module to load Metadata info from. Module passed may be
  /// loaded from bitcode file, i.e., different from OpenMPIRBuilder::M module.
  void loadOffloadInfoMetadata(Module &M);

  /// Loads all the offload entries information from the host IR
  /// metadata read from the file passed in as the HostFilePath argument. This
  /// function is only meant to be used with device code generation.
  ///
  /// \param HostFilePath The path to the host IR file,
  /// used to load in offload metadata for the device, allowing host and device
  /// to maintain the same metadata mapping.
  void loadOffloadInfoMetadata(StringRef HostFilePath);

  /// Gets (if a variable with the given name already exists) or creates an
  /// internal global variable with the specified Name. The created variable
  /// has CommonLinkage by default and is initialized by a null value.
  /// \param Ty Type of the global variable. If it already exists, the type
  /// must be the same.
  /// \param Name Name of the variable.
  GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                                              unsigned AddressSpace = 0);
};

/// Class to represent the control flow structure of an OpenMP canonical loop.
///
/// The control-flow structure is standardized for easy consumption by
/// directives associated with loops. For instance, the worksharing-loop
/// construct may change this control flow such that each loop iteration is
/// executed on only one thread. The constraints of a canonical loop in brief
/// are:
///
/// * The number of loop iterations must have been computed before entering the
///   loop.
///
/// * Has an (unsigned) logical induction variable that starts at zero and
///   increments by one.
///
/// * The loop's CFG itself has no side-effects. The OpenMP specification
///   itself allows side-effects, but the order in which they happen, including
///   how often or whether at all, is unspecified. We expect that the frontend
///   will emit those side-effect instructions somewhere (e.g. before the loop)
///   such that the CanonicalLoopInfo itself can be side-effect free.
///
/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
/// execution of a loop body that satisfies these constraints. It does NOT
/// represent arbitrary SESE regions that happen to contain a loop. Do not use
/// CanonicalLoopInfo for such purposes.
///
/// The control flow can be described as follows:
///
///         Preheader
///            |
///     /----> Header
///     |        |
///     |      Cond---\
///     |        |    |
///     |      Body   |
///     |      | |    |
///     |     <...>   |
///     |      | |    |
///      \----Latch   |
///                   |
///                 Exit
///                   |
///                 After
///
/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
/// including) and end at AfterIP (at the After's first instruction, excluding).
/// That is, instructions in the Preheader and After blocks (except the
/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
/// side-effects. Typically, the Preheader is used to compute the loop's trip
/// count. The instructions from BodyIP (at the Body block's first instruction,
/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
/// control and thus can have side-effects. The body block is the single entry
/// point into the loop body, which may contain arbitrary control flow as long
/// as all control paths eventually branch to the Latch block.
///
/// TODO: Consider adding another standardized BasicBlock between Body CFG and
/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier to not needing to consider multiple
/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
/// executes after each body iteration.
///
/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalent to that the Latch has no PHINode and the Header's only PHINode is
/// for the induction variable.
///
/// All code in Header, Cond, Latch and Exit (plus the terminator of the
/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
/// by assertOK(). They are expected to not be modified unless explicitly
/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
/// anymore as its underlying control flow may not exist anymore.
/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
/// may also return a new CanonicalLoopInfo that can be passed to other
/// loop-associated construct implementing methods. These loop-transforming
/// methods may either create a new CanonicalLoopInfo usually using
/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
/// modify one of the input CanonicalLoopInfo and return it as representing the
/// modified loop. What is done is an implementation detail of the
/// transformation-implementing method and callers should always assume that the
/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
/// created by createCanonicalLoop, such that transforming methods do not have
/// to special case where the CanonicalLoopInfo originated from.
///
/// Generally, methods consuming CanonicalLoopInfo do not need an
/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
/// any InsertPoint in the Preheader, After or Block can still be used after
/// calling such a method.
///
/// TODO: Provide mechanisms for exception handling and cancellation points.
///
/// Defined outside OpenMPIRBuilder because nested classes cannot be
/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
  friend class OpenMPIRBuilder;

private:
  // Control blocks of the canonical loop; all null once invalidate() has run.
  BasicBlock *Header = nullptr;
  BasicBlock *Cond = nullptr;
  BasicBlock *Latch = nullptr;
  BasicBlock *Exit = nullptr;

  /// Add the control blocks of this loop to \p BBs.
  ///
  /// This does not include any block from the body, including the one returned
  /// by getBody().
  ///
  /// FIXME: This currently includes the Preheader and After blocks even though
  /// their content is (mostly) not under CanonicalLoopInfo's control.
  /// Re-evaluate whether this makes sense.
  void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);

  /// Sets the number of loop iterations to the given value. This value must be
  /// valid in the condition block (i.e., defined in the preheader) and is
  /// interpreted as an unsigned integer.
  void setTripCount(Value *TripCount);

  /// Replace all uses of the canonical induction variable in the loop body with
  /// a new one.
  ///
  /// The intended use case is to update the induction variable for an updated
  /// iteration space such that it can stay normalized in the 0...tripcount-1
  /// range.
  ///
  /// The \p Updater is called with the (presumably updated) current normalized
  /// induction variable and is expected to return the value that uses of the
  /// pre-updated induction values should use instead, typically dependent on
  /// the new induction variable. This is a lambda (instead of e.g. just passing
  /// the new value) to be able to distinguish the uses of the pre-updated
  /// induction variable and uses of the induction variable to compute the
  /// updated induction variable value.
  void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);

public:
  /// Returns whether this object currently represents the IR of a loop. If
  /// returning false, it may have been consumed by a loop transformation or not
  /// been initialized. Do not use in this case.
  bool isValid() const { return Header; }

  /// The preheader ensures that there is only a single edge entering the loop.
  /// Code that must be executed before any loop iteration can be emitted here,
  /// such as computing the loop trip count and begin lifetime markers. Code in
  /// the preheader is not considered part of the canonical loop.
  BasicBlock *getPreheader() const;

  /// The header is the entry for each iteration. In the canonical control flow,
  /// it only contains the PHINode for the induction variable.
  BasicBlock *getHeader() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header;
  }

  /// The condition block computes whether there is another loop iteration. If
  /// yes, branches to the body; otherwise to the exit block.
  BasicBlock *getCond() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Cond;
  }

  /// The body block is the single entry for a loop iteration and not controlled
  /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
  /// eventually branch to the \p Latch block.
  BasicBlock *getBody() const {
    assert(isValid() && "Requires a valid canonical loop");
    // The body is the first successor of the condition's conditional branch.
    return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
  }

  /// Reaching the latch indicates the end of the loop body code. In the
  /// canonical control flow, it only contains the increment of the induction
  /// variable.
  BasicBlock *getLatch() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Latch;
  }

  /// Reaching the exit indicates no more iterations are being executed.
  BasicBlock *getExit() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit;
  }

  /// The after block is intended for clean-up code such as lifetime end
  /// markers. It is separate from the exit block to ensure, analogous to the
  /// preheader, that it has just a single entry edge and is free from PHI
  /// nodes should there be multiple loop exits (such as from break
  /// statements/cancellations).
  BasicBlock *getAfter() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit->getSingleSuccessor();
  }

  /// Returns the llvm::Value containing the number of loop iterations. It must
  /// be valid in the preheader and always interpreted as an unsigned integer of
  /// any bit-width.
  Value *getTripCount() const {
    assert(isValid() && "Requires a valid canonical loop");
    // The trip count is the second operand of the IV-vs-count comparison that
    // leads the condition block.
    Instruction *CmpI = &Cond->front();
    assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
    return CmpI->getOperand(1);
  }

  /// Returns the instruction representing the current logical induction
  /// variable. Always unsigned, always starting at 0 with an increment of one.
  Instruction *getIndVar() const {
    assert(isValid() && "Requires a valid canonical loop");
    // In canonical form, the IV PHI is the first (and only) PHI in the header.
    Instruction *IndVarPHI = &Header->front();
    assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
    return IndVarPHI;
  }

  /// Return the type of the induction variable (and the trip count).
  Type *getIndVarType() const {
    assert(isValid() && "Requires a valid canonical loop");
    return getIndVar()->getType();
  }

  /// Return the insertion point for user code before the loop.
  OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    // Insert just before the preheader's terminator.
    BasicBlock *Preheader = getPreheader();
    return {Preheader, std::prev(Preheader->end())};
  };

  /// Return the insertion point for user code in the body.
  OpenMPIRBuilder::InsertPointTy getBodyIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *Body = getBody();
    return {Body, Body->begin()};
  };

  /// Return the insertion point for user code after the loop.
  OpenMPIRBuilder::InsertPointTy getAfterIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *After = getAfter();
    return {After, After->begin()};
  };

  /// Return the function containing this canonical loop.
  Function *getFunction() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header->getParent();
  }

  /// Consistency self-check.
  void assertOK() const;

  /// Invalidate this loop. That is, the underlying IR does not fulfill the
  /// requirements of an OpenMP canonical loop anymore.
  void invalidate();
};

} // end namespace llvm

#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H