//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e., whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice versa.
/// This is because an InsertPoint stores the instruction before which
/// something will be inserted. For instance, if both point to the same
/// instruction, two IRBuilders alternately creating instructions will cause
/// the instructions to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use based on the schedule clause
/// arguments.
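/// As an illustrative reading of the mapping below (the clause spellings are
/// ordinary OpenMP source syntax, not names used in this file):
///   schedule(static)         -> BaseStatic
///   schedule(static, 4)      -> BaseStaticChunked
///   schedule(dynamic, 4)     -> BaseDynamicChunked
///   schedule(simd: runtime)  -> BaseRuntimeSimd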
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return llvm::omp::OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to the schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to the schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
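/// For example, a plain schedule(dynamic, 4) on a loop without an ordered
/// clause combines to
///   BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic,
/// since the nonmonotonic modifier is implied for non-static, non-ordered
/// schedules per OpenMP 5.1, 2.11.4 (see getOpenMPMonotonicityScheduleType).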
247 static OMPScheduleType 248 computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, 249 bool HasSimdModifier, bool HasMonotonicModifier, 250 bool HasNonmonotonicModifier, bool HasOrderedClause) { 251 OMPScheduleType BaseSchedule = 252 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier); 253 OMPScheduleType OrderedSchedule = 254 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause); 255 OMPScheduleType Result = getOpenMPMonotonicityScheduleType( 256 OrderedSchedule, HasSimdModifier, HasMonotonicModifier, 257 HasNonmonotonicModifier, HasOrderedClause); 258 259 assert(isValidWorkshareLoopScheduleType(Result)); 260 return Result; 261 } 262 263 /// Make \p Source branch to \p Target. 264 /// 265 /// Handles two situations: 266 /// * \p Source already has an unconditional branch. 267 /// * \p Source is a degenerate block (no terminator because the BB is 268 /// the current head of the IR construction). 269 static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) { 270 if (Instruction *Term = Source->getTerminator()) { 271 auto *Br = cast<BranchInst>(Term); 272 assert(!Br->isConditional() && 273 "BB's terminator must be an unconditional branch (or degenerate)"); 274 BasicBlock *Succ = Br->getSuccessor(0); 275 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true); 276 Br->setSuccessor(0, Target); 277 return; 278 } 279 280 auto *NewBr = BranchInst::Create(Target, Source); 281 NewBr->setDebugLoc(DL); 282 } 283 284 void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, 285 bool CreateBranch) { 286 assert(New->getFirstInsertionPt() == New->begin() && 287 "Target BB must not have PHI nodes"); 288 289 // Move instructions to new block. 290 BasicBlock *Old = IP.getBlock(); 291 New->splice(New->begin(), Old, IP.getPoint(), Old->end()); 292 293 if (CreateBranch) 294 BranchInst::Create(New, Old); 295 } 296 297 void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) { 298 DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); 299 BasicBlock *Old = Builder.GetInsertBlock(); 300 301 spliceBB(Builder.saveIP(), New, CreateBranch); 302 if (CreateBranch) 303 Builder.SetInsertPoint(Old->getTerminator()); 304 else 305 Builder.SetInsertPoint(Old); 306 307 // SetInsertPoint also updates the Builder's debug location, but we want to 308 // keep the one the Builder was configured to use. 309 Builder.SetCurrentDebugLocation(DebugLoc); 310 } 311 312 BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, 313 llvm::Twine Name) { 314 BasicBlock *Old = IP.getBlock(); 315 BasicBlock *New = BasicBlock::Create( 316 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name, 317 Old->getParent(), Old->getNextNode()); 318 spliceBB(IP, New, CreateBranch); 319 New->replaceSuccessorsPhiUsesWith(Old, New); 320 return New; 321 } 322 323 BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch, 324 llvm::Twine Name) { 325 DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); 326 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name); 327 if (CreateBranch) 328 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); 329 else 330 Builder.SetInsertPoint(Builder.GetInsertBlock()); 331 // SetInsertPoint also updates the Builder's debug location, but we want to 332 // keep the one the Builder was configured to use. 
333 Builder.SetCurrentDebugLocation(DebugLoc); 334 return New; 335 } 336 337 BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch, 338 llvm::Twine Name) { 339 DebugLoc DebugLoc = Builder.getCurrentDebugLocation(); 340 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name); 341 if (CreateBranch) 342 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator()); 343 else 344 Builder.SetInsertPoint(Builder.GetInsertBlock()); 345 // SetInsertPoint also updates the Builder's debug location, but we want to 346 // keep the one the Builder was configured to use. 347 Builder.SetCurrentDebugLocation(DebugLoc); 348 return New; 349 } 350 351 BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, 352 llvm::Twine Suffix) { 353 BasicBlock *Old = Builder.GetInsertBlock(); 354 return splitBB(Builder, CreateBranch, Old->getName() + Suffix); 355 } 356 357 // This function creates a fake integer value and a fake use for the integer 358 // value. It returns the fake value created. This is useful in modeling the 359 // extra arguments to the outlined functions. 360 Value *createFakeIntVal(IRBuilder<> &Builder, 361 OpenMPIRBuilder::InsertPointTy OuterAllocaIP, 362 std::stack<Instruction *> &ToBeDeleted, 363 OpenMPIRBuilder::InsertPointTy InnerAllocaIP, 364 const Twine &Name = "", bool AsPtr = true) { 365 Builder.restoreIP(OuterAllocaIP); 366 Instruction *FakeVal; 367 AllocaInst *FakeValAddr = 368 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr"); 369 ToBeDeleted.push(FakeValAddr); 370 371 if (AsPtr) { 372 FakeVal = FakeValAddr; 373 } else { 374 FakeVal = 375 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val"); 376 ToBeDeleted.push(FakeVal); 377 } 378 379 // Generate a fake use of this value 380 Builder.restoreIP(InnerAllocaIP); 381 Instruction *UseFakeVal; 382 if (AsPtr) { 383 UseFakeVal = 384 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use"); 385 } else { 386 UseFakeVal = 387 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10))); 388 } 389 ToBeDeleted.push(UseFakeVal); 390 return FakeVal; 391 } 392 393 //===----------------------------------------------------------------------===// 394 // OpenMPIRBuilderConfig 395 //===----------------------------------------------------------------------===// 396 397 namespace { 398 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); 399 /// Values for bit flags for marking which requires clauses have been used. 400 enum OpenMPOffloadingRequiresDirFlags { 401 /// flag undefined. 402 OMP_REQ_UNDEFINED = 0x000, 403 /// no requires directive present. 404 OMP_REQ_NONE = 0x001, 405 /// reverse_offload clause. 406 OMP_REQ_REVERSE_OFFLOAD = 0x002, 407 /// unified_address clause. 408 OMP_REQ_UNIFIED_ADDRESS = 0x004, 409 /// unified_shared_memory clause. 410 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, 411 /// dynamic_allocators clause. 
412 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010, 413 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS) 414 }; 415 416 } // anonymous namespace 417 418 OpenMPIRBuilderConfig::OpenMPIRBuilderConfig() 419 : RequiresFlags(OMP_REQ_UNDEFINED) {} 420 421 OpenMPIRBuilderConfig::OpenMPIRBuilderConfig( 422 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory, 423 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress, 424 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators) 425 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU), 426 OpenMPOffloadMandatory(OpenMPOffloadMandatory), 427 RequiresFlags(OMP_REQ_UNDEFINED) { 428 if (HasRequiresReverseOffload) 429 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD; 430 if (HasRequiresUnifiedAddress) 431 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS; 432 if (HasRequiresUnifiedSharedMemory) 433 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY; 434 if (HasRequiresDynamicAllocators) 435 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS; 436 } 437 438 bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const { 439 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD; 440 } 441 442 bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const { 443 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS; 444 } 445 446 bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const { 447 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY; 448 } 449 450 bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const { 451 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS; 452 } 453 454 int64_t OpenMPIRBuilderConfig::getRequiresFlags() const { 455 return hasRequiresFlags() ? RequiresFlags 456 : static_cast<int64_t>(OMP_REQ_NONE); 457 } 458 459 void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) { 460 if (Value) 461 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD; 462 else 463 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD; 464 } 465 466 void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) { 467 if (Value) 468 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS; 469 else 470 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS; 471 } 472 473 void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) { 474 if (Value) 475 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY; 476 else 477 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY; 478 } 479 480 void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) { 481 if (Value) 482 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS; 483 else 484 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS; 485 } 486 487 //===----------------------------------------------------------------------===// 488 // OpenMPIRBuilder 489 //===----------------------------------------------------------------------===// 490 491 void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs, 492 IRBuilderBase &Builder, 493 SmallVector<Value *> &ArgsVector) { 494 Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION); 495 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems); 496 auto Int32Ty = Type::getInt32Ty(Builder.getContext()); 497 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3)); 498 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait); 499 500 Value *NumTeams3D = 501 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0}); 502 Value *NumThreads3D = 503 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0}); 504 505 ArgsVector = {Version, 506 PointerNum, 507 KernelArgs.RTArgs.BasePointersArray, 508 KernelArgs.RTArgs.PointersArray, 509 
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)               \
  case Enum:                                                                  \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                          \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)               \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                        \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                         \
  case Enum:                                                                  \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},       \
                             IsVarArg);                                       \
    Fn = M.getFunction(Str);                                                  \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; this may happen with
    // nested function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
689 OutlinedFn->removeFromParent(); 690 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn); 691 692 // Remove the artificial entry introduced by the extractor right away, we 693 // made our own entry block after all. 694 { 695 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock(); 696 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB); 697 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry); 698 // Move instructions from the to-be-deleted ArtificialEntry to the entry 699 // basic block of the parallel region. CodeExtractor generates 700 // instructions to unwrap the aggregate argument and may sink 701 // allocas/bitcasts for values that are solely used in the outlined region 702 // and do not escape. 703 assert(!ArtificialEntry.empty() && 704 "Expected instructions to add in the outlined region entry"); 705 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(), 706 End = ArtificialEntry.rend(); 707 It != End;) { 708 Instruction &I = *It; 709 It++; 710 711 if (I.isTerminator()) 712 continue; 713 714 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt()); 715 } 716 717 OI.EntryBB->moveBefore(&ArtificialEntry); 718 ArtificialEntry.eraseFromParent(); 719 } 720 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB); 721 assert(OutlinedFn && OutlinedFn->getNumUses() == 1); 722 723 // Run a user callback, e.g. to add attributes. 724 if (OI.PostOutlineCB) 725 OI.PostOutlineCB(*OutlinedFn); 726 } 727 728 // Remove work items that have been completed. 729 OutlineInfos = std::move(DeferredOutlines); 730 731 EmitMetadataErrorReportFunctionTy &&ErrorReportFn = 732 [](EmitMetadataErrorKind Kind, 733 const TargetRegionEntryInfo &EntryInfo) -> void { 734 errs() << "Error of kind: " << Kind 735 << " when emitting offload entries and metadata during " 736 "OMPIRBuilder finalization \n"; 737 }; 738 739 if (!OffloadInfoManager.empty()) 740 createOffloadEntriesAndInfoMetadata(ErrorReportFn); 741 } 742 743 OpenMPIRBuilder::~OpenMPIRBuilder() { 744 assert(OutlineInfos.empty() && "There must be no outstanding outlinings"); 745 } 746 747 GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) { 748 IntegerType *I32Ty = Type::getInt32Ty(M.getContext()); 749 auto *GV = 750 new GlobalVariable(M, I32Ty, 751 /* isConstant = */ true, GlobalValue::WeakODRLinkage, 752 ConstantInt::get(I32Ty, Value), Name); 753 GV->setVisibility(GlobalValue::HiddenVisibility); 754 755 return GV; 756 } 757 758 Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, 759 uint32_t SrcLocStrSize, 760 IdentFlag LocFlags, 761 unsigned Reserve2Flags) { 762 // Enable "C-mode". 763 LocFlags |= OMP_IDENT_FLAG_KMPC; 764 765 Constant *&Ident = 766 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}]; 767 if (!Ident) { 768 Constant *I32Null = ConstantInt::getNullValue(Int32); 769 Constant *IdentData[] = {I32Null, 770 ConstantInt::get(Int32, uint32_t(LocFlags)), 771 ConstantInt::get(Int32, Reserve2Flags), 772 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr}; 773 Constant *Initializer = 774 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData); 775 776 // Look for existing encoding of the location + flags, not needed but 777 // minimizes the difference to the existing solution while we transition. 
778 for (GlobalVariable &GV : M.globals()) 779 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer()) 780 if (GV.getInitializer() == Initializer) 781 Ident = &GV; 782 783 if (!Ident) { 784 auto *GV = new GlobalVariable( 785 M, OpenMPIRBuilder::Ident, 786 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "", 787 nullptr, GlobalValue::NotThreadLocal, 788 M.getDataLayout().getDefaultGlobalsAddressSpace()); 789 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); 790 GV->setAlignment(Align(8)); 791 Ident = GV; 792 } 793 } 794 795 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr); 796 } 797 798 Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr, 799 uint32_t &SrcLocStrSize) { 800 SrcLocStrSize = LocStr.size(); 801 Constant *&SrcLocStr = SrcLocStrMap[LocStr]; 802 if (!SrcLocStr) { 803 Constant *Initializer = 804 ConstantDataArray::getString(M.getContext(), LocStr); 805 806 // Look for existing encoding of the location, not needed but minimizes the 807 // difference to the existing solution while we transition. 808 for (GlobalVariable &GV : M.globals()) 809 if (GV.isConstant() && GV.hasInitializer() && 810 GV.getInitializer() == Initializer) 811 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr); 812 813 SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "", 814 /* AddressSpace */ 0, &M); 815 } 816 return SrcLocStr; 817 } 818 819 Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName, 820 StringRef FileName, 821 unsigned Line, unsigned Column, 822 uint32_t &SrcLocStrSize) { 823 SmallString<128> Buffer; 824 Buffer.push_back(';'); 825 Buffer.append(FileName); 826 Buffer.push_back(';'); 827 Buffer.append(FunctionName); 828 Buffer.push_back(';'); 829 Buffer.append(std::to_string(Line)); 830 Buffer.push_back(';'); 831 Buffer.append(std::to_string(Column)); 832 Buffer.push_back(';'); 833 Buffer.push_back(';'); 834 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize); 835 } 836 837 Constant * 838 OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) { 839 StringRef UnknownLoc = ";unknown;unknown;0;0;;"; 840 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize); 841 } 842 843 Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, 844 uint32_t &SrcLocStrSize, 845 Function *F) { 846 DILocation *DIL = DL.get(); 847 if (!DIL) 848 return getOrCreateDefaultSrcLocStr(SrcLocStrSize); 849 StringRef FileName = M.getName(); 850 if (DIFile *DIF = DIL->getFile()) 851 if (std::optional<StringRef> Source = DIF->getSource()) 852 FileName = *Source; 853 StringRef Function = DIL->getScope()->getSubprogram()->getName(); 854 if (Function.empty() && F) 855 Function = F->getName(); 856 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(), 857 DIL->getColumn(), SrcLocStrSize); 858 } 859 860 Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc, 861 uint32_t &SrcLocStrSize) { 862 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize, 863 Loc.IP.getBlock()->getParent()); 864 } 865 866 Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) { 867 return Builder.CreateCall( 868 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident, 869 "omp_global_thread_num"); 870 } 871 872 OpenMPIRBuilder::InsertPointTy 873 OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK, 874 bool ForceSimpleCall, bool CheckCancelFlag) { 875 if (!updateToLocation(Loc)) 876 return Loc.IP; 877 return emitBarrierImpl(Loc, DK, ForceSimpleCall, 
                         CheckCancelFlag);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
                                 bool ForceSimpleCall, bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities expect blocks to have terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply calls the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
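  // Illustrative shape of the IR emitted below (the SSA value names are
  // assumed; the block names are the ones created here):
  //   %rc = call i32 @__tgt_target_kernel(...)
  //   %failed = icmp ne i32 %rc, 0
  //   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
  // The failed block invokes the host fallback (emitTargetCallFallbackCB) and
  // then branches to the continuation block.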
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we move to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}

// Callback used to create OpenMP runtime calls to support the
// omp parallel clause for the device.
// We need to use this callback to replace the call to OutlinedFn in OuterFn
// with a call to the OpenMP DeviceRTL runtime function (__kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
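  // A sketch of the outlined function these attributes decorate, assuming the
  // usual two leading bookkeeping parameters (the captured arguments vary per
  // region, and the name is hypothetical apart from the ".omp_par" suffix):
  //   void @<region>..omp_par(ptr %tid.addr, ptr %zero.addr, <captured args>)
  // The addParamAttr calls below target parameters 0 and 1.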
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store the captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support the
// omp parallel clause for the host.
// We need to use this callback to replace the call to OutlinedFn in OuterFn
// with a call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build the call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
1280 CI->eraseFromParent(); 1281 1282 for (Instruction *I : ToBeDeleted) { 1283 I->eraseFromParent(); 1284 } 1285 } 1286 1287 IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel( 1288 const LocationDescription &Loc, InsertPointTy OuterAllocaIP, 1289 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, 1290 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, 1291 omp::ProcBindKind ProcBind, bool IsCancellable) { 1292 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); 1293 1294 if (!updateToLocation(Loc)) 1295 return Loc.IP; 1296 1297 uint32_t SrcLocStrSize; 1298 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 1299 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 1300 Value *ThreadID = getOrCreateThreadID(Ident); 1301 // If we generate code for the target device, we need to allocate 1302 // struct for aggregate params in the device default alloca address space. 1303 // OpenMP runtime requires that the params of the extracted functions are 1304 // passed as zero address space pointers. This flag ensures that extracted 1305 // function arguments are declared in zero address space 1306 bool ArgsInZeroAddressSpace = Config.isTargetDevice(); 1307 1308 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads) 1309 // only if we compile for host side. 1310 if (NumThreads && !Config.isTargetDevice()) { 1311 Value *Args[] = { 1312 Ident, ThreadID, 1313 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)}; 1314 Builder.CreateCall( 1315 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args); 1316 } 1317 1318 if (ProcBind != OMP_PROC_BIND_default) { 1319 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind) 1320 Value *Args[] = { 1321 Ident, ThreadID, 1322 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)}; 1323 Builder.CreateCall( 1324 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args); 1325 } 1326 1327 BasicBlock *InsertBB = Builder.GetInsertBlock(); 1328 Function *OuterFn = InsertBB->getParent(); 1329 1330 // Save the outer alloca block because the insertion iterator may get 1331 // invalidated and we still need this later. 1332 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); 1333 1334 // Vector to remember instructions we used only during the modeling but which 1335 // we want to delete at the end. 1336 SmallVector<Instruction *, 4> ToBeDeleted; 1337 1338 // Change the location to the outer alloca insertion point to create and 1339 // initialize the allocas we pass into the parallel region. 
  Builder.restoreIP(OuterAllocaIP);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space.
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB       <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB        <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB        <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB        <- A common exit to simplify block collection.
1420 // 1421 1422 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n"); 1423 1424 // Let the caller create the body. 1425 assert(BodyGenCB && "Expected body generation callback!"); 1426 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin()); 1427 BodyGenCB(InnerAllocaIP, CodeGenIP); 1428 1429 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n"); 1430 1431 OutlineInfo OI; 1432 if (Config.isTargetDevice()) { 1433 // Generate OpenMP target specific runtime call 1434 OI.PostOutlineCB = [=, ToBeDeletedVec = 1435 std::move(ToBeDeleted)](Function &OutlinedFn) { 1436 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident, 1437 IfCondition, NumThreads, PrivTID, PrivTIDAddr, 1438 ThreadID, ToBeDeletedVec); 1439 }; 1440 } else { 1441 // Generate OpenMP host runtime call 1442 OI.PostOutlineCB = [=, ToBeDeletedVec = 1443 std::move(ToBeDeleted)](Function &OutlinedFn) { 1444 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition, 1445 PrivTID, PrivTIDAddr, ToBeDeletedVec); 1446 }; 1447 } 1448 1449 // Adjust the finalization stack, verify the adjustment, and call the 1450 // finalize function a last time to finalize values between the pre-fini 1451 // block and the exit block if we left the parallel "the normal way". 1452 auto FiniInfo = FinalizationStack.pop_back_val(); 1453 (void)FiniInfo; 1454 assert(FiniInfo.DK == OMPD_parallel && 1455 "Unexpected finalization stack state!"); 1456 1457 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator(); 1458 1459 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator()); 1460 FiniCB(PreFiniIP); 1461 1462 OI.OuterAllocaBB = OuterAllocaBlock; 1463 OI.EntryBB = PRegEntryBB; 1464 OI.ExitBB = PRegExitBB; 1465 1466 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet; 1467 SmallVector<BasicBlock *, 32> Blocks; 1468 OI.collectBlocks(ParallelRegionBlockSet, Blocks); 1469 1470 // Ensure a single exit node for the outlined region by creating one. 1471 // We might have multiple incoming edges to the exit now due to finalizations, 1472 // e.g., cancel calls that cause the control flow to leave the region. 1473 BasicBlock *PRegOutlinedExitBB = PRegExitBB; 1474 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt()); 1475 PRegOutlinedExitBB->setName("omp.par.outlined.exit"); 1476 Blocks.push_back(PRegOutlinedExitBB); 1477 1478 CodeExtractorAnalysisCache CEAC(*OuterFn); 1479 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, 1480 /* AggregateArgs */ false, 1481 /* BlockFrequencyInfo */ nullptr, 1482 /* BranchProbabilityInfo */ nullptr, 1483 /* AssumptionCache */ nullptr, 1484 /* AllowVarArgs */ true, 1485 /* AllowAlloca */ true, 1486 /* AllocationBlock */ OuterAllocaBlock, 1487 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace); 1488 1489 // Find inputs to, outputs from the code region. 
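  // Each collected input is forwarded into the outlined region, either
  // directly or rewritten by the privatization callback below. The outputs
  // must remain empty: OpenMP outlining must not produce live-out values,
  // which is asserted once privatization is done.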
1490 BasicBlock *CommonExit = nullptr;
1491 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1492 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1493 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
1494
1495 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1496
1497 FunctionCallee TIDRTLFn =
1498 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1499
1500 auto PrivHelper = [&](Value &V) {
1501 if (&V == TIDAddr || &V == ZeroAddr) {
1502 OI.ExcludeArgsFromAggregate.push_back(&V);
1503 return;
1504 }
1505
1506 SetVector<Use *> Uses;
1507 for (Use &U : V.uses())
1508 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1509 if (ParallelRegionBlockSet.count(UserI->getParent()))
1510 Uses.insert(&U);
1511
1512 // __kmpc_fork_call expects extra arguments as pointers. If the input
1513 // already has a pointer type, everything is fine. Otherwise, store the
1514 // value onto stack and load it back inside the to-be-outlined region. This
1515 // will ensure only the pointer will be passed to the function.
1516 // FIXME: if there are more than 15 trailing arguments, they must be
1517 // additionally packed in a struct.
1518 Value *Inner = &V;
1519 if (!V.getType()->isPointerTy()) {
1520 IRBuilder<>::InsertPointGuard Guard(Builder);
1521 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1522
1523 Builder.restoreIP(OuterAllocaIP);
1524 Value *Ptr =
1525 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1526
1527 // Store to stack at the end of the block that currently branches to the
1528 // entry block of the to-be-outlined region.
1529 Builder.SetInsertPoint(InsertBB,
1530 InsertBB->getTerminator()->getIterator());
1531 Builder.CreateStore(&V, Ptr);
1532
1533 // Load back next to allocations in the to-be-outlined region.
1534 Builder.restoreIP(InnerAllocaIP);
1535 Inner = Builder.CreateLoad(V.getType(), Ptr);
1536 }
1537
1538 Value *ReplacementValue = nullptr;
1539 CallInst *CI = dyn_cast<CallInst>(&V);
1540 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1541 ReplacementValue = PrivTID;
1542 } else {
1543 Builder.restoreIP(
1544 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1545 assert(ReplacementValue &&
1546 "Expected copy/create callback to set replacement value!");
1547 if (ReplacementValue == &V)
1548 return;
1549 }
1550
1551 for (Use *UPtr : Uses)
1552 UPtr->set(ReplacementValue);
1553 };
1554
1555 // Reset the inner alloca insertion as it will be used for loading the values
1556 // wrapped into pointers before passing them into the to-be-outlined region.
1557 // Configure it to insert immediately after the fake use of the zero address
1558 // so that the loaded values are available in the generated body and so that
1559 // the OpenMP-related values (thread ID and zero address pointers) remain
1560 // leading in the argument list.
1561 InnerAllocaIP = IRBuilder<>::InsertPoint(
1562 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1563
1564 // Reset the outer alloca insertion point to the entry of the relevant block
1565 // in case it was invalidated.
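// (For illustration: this is the insertion point at which the ".reloaded"
// allocas created by PrivHelper for non-pointer inputs will be placed.)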
1566 OuterAllocaIP = IRBuilder<>::InsertPoint(
1567 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1568
1569 for (Value *Input : Inputs) {
1570 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1571 PrivHelper(*Input);
1572 }
1573 LLVM_DEBUG({
1574 for (Value *Output : Outputs)
1575 dbgs() << "Captured output: " << *Output << "\n";
1576 });
1577 assert(Outputs.empty() &&
1578 "OpenMP outlining should not produce live-out values!");
1579
1580 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1581 LLVM_DEBUG({
1582 for (auto *BB : Blocks)
1583 dbgs() << " PBR: " << BB->getName() << "\n";
1584 });
1585
1586 // Register the outlined info.
1587 addOutlineInfo(std::move(OI));
1588
1589 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1590 UI->eraseFromParent();
1591
1592 return AfterIP;
1593 }
1594
1595 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1596 // Build call void __kmpc_flush(ident_t *loc)
1597 uint32_t SrcLocStrSize;
1598 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1599 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1600
1601 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1602 }
1603
1604 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1605 if (!updateToLocation(Loc))
1606 return;
1607 emitFlush(Loc);
1608 }
1609
1610 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1611 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1612 // global_tid);
1613 uint32_t SrcLocStrSize;
1614 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1615 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1616 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1617
1618 // Ignore return result until untied tasks are supported.
1619 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1620 Args);
1621 }
1622
1623 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1624 if (!updateToLocation(Loc))
1625 return;
1626 emitTaskwaitImpl(Loc);
1627 }
1628
1629 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1630 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1631 uint32_t SrcLocStrSize;
1632 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1633 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1634 Constant *I32Null = ConstantInt::getNullValue(Int32);
1635 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1636
1637 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1638 Args);
1639 }
1640
1641 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1642 if (!updateToLocation(Loc))
1643 return;
1644 emitTaskyieldImpl(Loc);
1645 }
1646
1647 OpenMPIRBuilder::InsertPointTy
1648 OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1649 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1650 bool Tied, Value *Final, Value *IfCondition,
1651 SmallVector<DependData> Dependencies) {
1652
1653 if (!updateToLocation(Loc))
1654 return InsertPointTy();
1655
1656 uint32_t SrcLocStrSize;
1657 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1658 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1659 // The current basic block is split into four basic blocks. After outlining,
1660 // they will be mapped as follows:
1661 // ```
1662 // def current_fn() {
1663 // current_basic_block:
1664 // br label %task.exit
1665 // task.exit:
1666 // ; instructions after task
1667 // }
1668 // def outlined_fn() {
1669 // task.alloca:
1670 // br label %task.body
1671 // task.body:
1672 // ret void
1673 // }
1674 // ```
1675 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1676 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1677 BasicBlock *TaskAllocaBB =
1678 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1679
1680 InsertPointTy TaskAllocaIP =
1681 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1682 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1683 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1684
1685 OutlineInfo OI;
1686 OI.EntryBB = TaskAllocaBB;
1687 OI.OuterAllocaBB = AllocaIP.getBlock();
1688 OI.ExitBB = TaskExitBB;
1689
1690 // Add the thread ID argument.
1691 std::stack<Instruction *> ToBeDeleted;
1692 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1693 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1694
1695 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1696 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1697 // Replace the stale CI with an appropriate RTL function call.
1698 assert(OutlinedFn.getNumUses() == 1 &&
1699 "there must be a single user for the outlined function");
1700 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1701
1702 // HasShareds is true if any variables are captured in the outlined region,
1703 // false otherwise.
1704 bool HasShareds = StaleCI->arg_size() > 1;
1705 Builder.SetInsertPoint(StaleCI);
1706
1707 // Gather the arguments for emitting the runtime call for
1708 // @__kmpc_omp_task_alloc
1709 Function *TaskAllocFn =
1710 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1711
1712 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
1713 // call.
1714 Value *ThreadID = getOrCreateThreadID(Ident);
1715
1716 // Argument - `flags`
1717 // Task is tied iff (Flags & 1) == 1.
1718 // Task is untied iff (Flags & 1) == 0.
1719 // Task is final iff (Flags & 2) == 2.
1720 // Task is not final iff (Flags & 2) == 0.
1721 // TODO: Handle the other flags.
1722 Value *Flags = Builder.getInt32(Tied);
1723 if (Final) {
1724 Value *FinalFlag =
1725 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1726 Flags = Builder.CreateOr(FinalFlag, Flags);
1727 }
1728
1729 // Argument - `sizeof_kmp_task_t` (TaskSize)
1730 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1731 // including private vars accessed in the task.
1732 // TODO: add kmp_task_t_with_privates (privates)
1733 Value *TaskSize = Builder.getInt64(
1734 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1735
1736 // Argument - `sizeof_shareds` (SharedsSize)
1737 // SharedsSize refers to the shareds array size in the kmp_task_t data
1738 // structure.
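// For illustration, the allocation emitted below is expected to look
// roughly like this (exact types depend on the target's data layout):
//   %task.data = call ptr @__kmpc_omp_task_alloc(ptr %ident, i32 %gtid,
//                  i32 %flags, i64 %sizeof_kmp_task_t, i64 %sizeof_shareds,
//                  ptr @outlined_fn)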
1739 Value *SharedsSize = Builder.getInt64(0); 1740 if (HasShareds) { 1741 AllocaInst *ArgStructAlloca = 1742 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1)); 1743 assert(ArgStructAlloca && 1744 "Unable to find the alloca instruction corresponding to arguments " 1745 "for extracted function"); 1746 StructType *ArgStructType = 1747 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType()); 1748 assert(ArgStructType && "Unable to find struct type corresponding to " 1749 "arguments for extracted function"); 1750 SharedsSize = 1751 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); 1752 } 1753 // Emit the @__kmpc_omp_task_alloc runtime call 1754 // The runtime call returns a pointer to an area where the task captured 1755 // variables must be copied before the task is run (TaskData) 1756 CallInst *TaskData = Builder.CreateCall( 1757 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, 1758 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, 1759 /*task_func=*/&OutlinedFn}); 1760 1761 // Copy the arguments for outlined function 1762 if (HasShareds) { 1763 Value *Shareds = StaleCI->getArgOperand(1); 1764 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout()); 1765 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); 1766 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, 1767 SharedsSize); 1768 } 1769 1770 Value *DepArray = nullptr; 1771 if (Dependencies.size()) { 1772 InsertPointTy OldIP = Builder.saveIP(); 1773 Builder.SetInsertPoint( 1774 &OldIP.getBlock()->getParent()->getEntryBlock().back()); 1775 1776 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size()); 1777 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr"); 1778 1779 unsigned P = 0; 1780 for (const DependData &Dep : Dependencies) { 1781 Value *Base = 1782 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P); 1783 // Store the pointer to the variable 1784 Value *Addr = Builder.CreateStructGEP( 1785 DependInfo, Base, 1786 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr)); 1787 Value *DepValPtr = 1788 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty()); 1789 Builder.CreateStore(DepValPtr, Addr); 1790 // Store the size of the variable 1791 Value *Size = Builder.CreateStructGEP( 1792 DependInfo, Base, 1793 static_cast<unsigned int>(RTLDependInfoFields::Len)); 1794 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize( 1795 Dep.DepValueType)), 1796 Size); 1797 // Store the dependency kind 1798 Value *Flags = Builder.CreateStructGEP( 1799 DependInfo, Base, 1800 static_cast<unsigned int>(RTLDependInfoFields::Flags)); 1801 Builder.CreateStore( 1802 ConstantInt::get(Builder.getInt8Ty(), 1803 static_cast<unsigned int>(Dep.DepKind)), 1804 Flags); 1805 ++P; 1806 } 1807 1808 Builder.restoreIP(OldIP); 1809 } 1810 1811 // In the presence of the `if` clause, the following IR is generated: 1812 // ... 1813 // %data = call @__kmpc_omp_task_alloc(...) 1814 // br i1 %if_condition, label %then, label %else 1815 // then: 1816 // call @__kmpc_omp_task(...) 1817 // br label %exit 1818 // else: 1819 // call @__kmpc_omp_task_begin_if0(...) 1820 // call @outlined_fn(...) 1821 // call @__kmpc_omp_task_complete_if0(...) 1822 // br label %exit 1823 // exit: 1824 // ... 1825 if (IfCondition) { 1826 // `SplitBlockAndInsertIfThenElse` requires the block to have a 1827 // terminator. 
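// (The splitBB call below, with CreateBranch=true, introduces exactly such
// a terminator: an unconditional branch to the new "if.end" block, which
// SplitBlockAndInsertIfThenElse then replaces with a conditional branch.)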
1828 splitBB(Builder, /*CreateBranch=*/true, "if.end"); 1829 Instruction *IfTerminator = 1830 Builder.GetInsertPoint()->getParent()->getTerminator(); 1831 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr; 1832 Builder.SetInsertPoint(IfTerminator); 1833 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI, 1834 &ElseTI); 1835 Builder.SetInsertPoint(ElseTI); 1836 Function *TaskBeginFn = 1837 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0); 1838 Function *TaskCompleteFn = 1839 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0); 1840 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData}); 1841 CallInst *CI = nullptr; 1842 if (HasShareds) 1843 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData}); 1844 else 1845 CI = Builder.CreateCall(&OutlinedFn, {ThreadID}); 1846 CI->setDebugLoc(StaleCI->getDebugLoc()); 1847 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData}); 1848 Builder.SetInsertPoint(ThenTI); 1849 } 1850 1851 if (Dependencies.size()) { 1852 Function *TaskFn = 1853 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps); 1854 Builder.CreateCall( 1855 TaskFn, 1856 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()), 1857 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0), 1858 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))}); 1859 1860 } else { 1861 // Emit the @__kmpc_omp_task runtime call to spawn the task 1862 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task); 1863 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData}); 1864 } 1865 1866 StaleCI->eraseFromParent(); 1867 1868 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin()); 1869 if (HasShareds) { 1870 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1)); 1871 OutlinedFn.getArg(1)->replaceUsesWithIf( 1872 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); 1873 } 1874 1875 while (!ToBeDeleted.empty()) { 1876 ToBeDeleted.top()->eraseFromParent(); 1877 ToBeDeleted.pop(); 1878 } 1879 }; 1880 1881 addOutlineInfo(std::move(OI)); 1882 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin()); 1883 1884 return Builder.saveIP(); 1885 } 1886 1887 OpenMPIRBuilder::InsertPointTy 1888 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc, 1889 InsertPointTy AllocaIP, 1890 BodyGenCallbackTy BodyGenCB) { 1891 if (!updateToLocation(Loc)) 1892 return InsertPointTy(); 1893 1894 uint32_t SrcLocStrSize; 1895 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 1896 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 1897 Value *ThreadID = getOrCreateThreadID(Ident); 1898 1899 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup 1900 Function *TaskgroupFn = 1901 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup); 1902 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID}); 1903 1904 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit"); 1905 BodyGenCB(AllocaIP, Builder.saveIP()); 1906 1907 Builder.SetInsertPoint(TaskgroupExitBB); 1908 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup 1909 Function *EndTaskgroupFn = 1910 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup); 1911 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID}); 1912 1913 return Builder.saveIP(); 1914 } 1915 1916 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( 1917 const LocationDescription &Loc, InsertPointTy AllocaIP, 1918 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB, 1919 
FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1920 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1921
1922 if (!updateToLocation(Loc))
1923 return Loc.IP;
1924
1925 auto FiniCBWrapper = [&](InsertPointTy IP) {
1926 if (IP.getBlock()->end() != IP.getPoint())
1927 return FiniCB(IP);
1928 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
1929 // will fail because that function requires the Finalization Basic Block to
1930 // have a terminator, which is already removed by EmitOMPRegionBody.
1931 // IP is currently at the cancellation block.
1932 // We need to backtrack to the condition block to fetch
1933 // the exit block and create a branch from the cancellation
1934 // block to the exit block.
1935 IRBuilder<>::InsertPointGuard IPG(Builder);
1936 Builder.restoreIP(IP);
1937 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
1938 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
1939 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
1940 Instruction *I = Builder.CreateBr(ExitBB);
1941 IP = InsertPointTy(I->getParent(), I->getIterator());
1942 return FiniCB(IP);
1943 };
1944
1945 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
1946
1947 // Each section is emitted as a switch case.
1948 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
1949 // -> OMP.createSection() which generates the IR for each section.
1950 // Iterate through all sections and emit a switch construct:
1951 // switch (IV) {
1952 // case 0:
1953 // <SectionStmt[0]>;
1954 // break;
1955 // ...
1956 // case <NumSection> - 1:
1957 // <SectionStmt[<NumSection> - 1]>;
1958 // break;
1959 // }
1960 // ...
1961 // section_loop.after:
1962 // <FiniCB>;
1963 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
1964 Builder.restoreIP(CodeGenIP);
1965 BasicBlock *Continue =
1966 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
1967 Function *CurFn = Continue->getParent();
1968 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
1969
1970 unsigned CaseNumber = 0;
1971 for (auto SectionCB : SectionCBs) {
1972 BasicBlock *CaseBB = BasicBlock::Create(
1973 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
1974 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
1975 Builder.SetInsertPoint(CaseBB);
1976 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
1977 SectionCB(InsertPointTy(),
1978 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
1979 CaseNumber++;
1980 }
1981 // Remove the existing terminator from the body BB since there can be no
1982 // terminators after a switch/case.
1983 };
1984 // Loop body ends here.
1985 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
1986 Type *I32Ty = Type::getInt32Ty(M.getContext());
1987 Value *LB = ConstantInt::get(I32Ty, 0);
1988 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
1989 Value *ST = ConstantInt::get(I32Ty, 1);
1990 llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
1991 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
1992 InsertPointTy AfterIP =
1993 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
1994
1995 // Apply the finalization callback in LoopAfterBB.
1996 auto FiniInfo = FinalizationStack.pop_back_val();
1997 assert(FiniInfo.DK == OMPD_sections &&
1998 "Unexpected finalization stack state!");
1999 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2000 Builder.restoreIP(AfterIP);
2001 BasicBlock *FiniBB =
2002 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2003 CB(Builder.saveIP());
2004 AfterIP = {FiniBB, FiniBB->begin()};
2005 }
2006
2007 return AfterIP;
2008 }
2009
2010 OpenMPIRBuilder::InsertPointTy
2011 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2012 BodyGenCallbackTy BodyGenCB,
2013 FinalizeCallbackTy FiniCB) {
2014 if (!updateToLocation(Loc))
2015 return Loc.IP;
2016
2017 auto FiniCBWrapper = [&](InsertPointTy IP) {
2018 if (IP.getBlock()->end() != IP.getPoint())
2019 return FiniCB(IP);
2020 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2021 // will fail because that function requires the Finalization Basic Block to
2022 // have a terminator, which is already removed by EmitOMPRegionBody.
2023 // IP is currently at the cancellation block.
2024 // We need to backtrack to the condition block to fetch
2025 // the exit block and create a branch from the cancellation
2026 // block to the exit block.
2027 IRBuilder<>::InsertPointGuard IPG(Builder);
2028 Builder.restoreIP(IP);
2029 auto *CaseBB = Loc.IP.getBlock();
2030 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2031 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2032 Instruction *I = Builder.CreateBr(ExitBB);
2033 IP = InsertPointTy(I->getParent(), I->getIterator());
2034 return FiniCB(IP);
2035 };
2036
2037 Directive OMPD = Directive::OMPD_sections;
2038 // Since we are using a finalization callback here, HasFinalize
2039 // and IsCancellable have to be true.
2040 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2041 /*Conditional*/ false, /*hasFinalize*/ true,
2042 /*IsCancellable*/ true);
2043 }
2044
2045 /// Create a function with a unique name and a "void (i8*, i8*)" signature in
2046 /// the given module and return it.
2047 Function *getFreshReductionFunc(Module &M) {
2048 Type *VoidTy = Type::getVoidTy(M.getContext());
2049 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
2050 auto *FuncTy =
2051 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
2052 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2053 M.getDataLayout().getDefaultGlobalsAddressSpace(),
2054 ".omp.reduction.func", &M);
2055 }
2056
2057 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
2058 const LocationDescription &Loc, InsertPointTy AllocaIP,
2059 ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
2060 for (const ReductionInfo &RI : ReductionInfos) {
2061 (void)RI;
2062 assert(RI.Variable && "expected non-null variable");
2063 assert(RI.PrivateVariable && "expected non-null private variable");
2064 assert(RI.ReductionGen && "expected non-null reduction generator callback");
2065 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
2066 "expected variables and their private equivalents to have the same "
2067 "type");
2068 assert(RI.Variable->getType()->isPointerTy() &&
2069 "expected variables to be pointers");
2070 }
2071
2072 if (!updateToLocation(Loc))
2073 return InsertPointTy();
2074
2075 BasicBlock *InsertBlock = Loc.IP.getBlock();
2076 BasicBlock *ContinuationBlock =
2077 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
2078 InsertBlock->getTerminator()->eraseFromParent();
2079
2080 // Create and populate array of type-erased pointers to private reduction
2081 // values.
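// A minimal sketch of what is built here, assuming two reductions:
//   %red.array = alloca [2 x ptr]                        ; at AllocaIP
//   %elem0 = getelementptr inbounds [2 x ptr], ptr %red.array, i64 0, i64 0
//   store ptr %private.var0, ptr %elem0
//   ... and likewise for the second element ...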
2082 unsigned NumReductions = ReductionInfos.size(); 2083 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions); 2084 Builder.restoreIP(AllocaIP); 2085 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array"); 2086 2087 Builder.SetInsertPoint(InsertBlock, InsertBlock->end()); 2088 2089 for (auto En : enumerate(ReductionInfos)) { 2090 unsigned Index = En.index(); 2091 const ReductionInfo &RI = En.value(); 2092 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64( 2093 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index)); 2094 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr); 2095 } 2096 2097 // Emit a call to the runtime function that orchestrates the reduction. 2098 // Declare the reduction function in the process. 2099 Function *Func = Builder.GetInsertBlock()->getParent(); 2100 Module *Module = Func->getParent(); 2101 uint32_t SrcLocStrSize; 2102 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 2103 bool CanGenerateAtomic = 2104 llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) { 2105 return RI.AtomicReductionGen; 2106 }); 2107 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, 2108 CanGenerateAtomic 2109 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE 2110 : IdentFlag(0)); 2111 Value *ThreadId = getOrCreateThreadID(Ident); 2112 Constant *NumVariables = Builder.getInt32(NumReductions); 2113 const DataLayout &DL = Module->getDataLayout(); 2114 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy); 2115 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize); 2116 Function *ReductionFunc = getFreshReductionFunc(*Module); 2117 Value *Lock = getOMPCriticalRegionLock(".reduction"); 2118 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr( 2119 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait 2120 : RuntimeFunction::OMPRTL___kmpc_reduce); 2121 CallInst *ReduceCall = 2122 Builder.CreateCall(ReduceFunc, 2123 {Ident, ThreadId, NumVariables, RedArraySize, RedArray, 2124 ReductionFunc, Lock}, 2125 "reduce"); 2126 2127 // Create final reduction entry blocks for the atomic and non-atomic case. 2128 // Emit IR that dispatches control flow to one of the blocks based on the 2129 // reduction supporting the atomic mode. 2130 BasicBlock *NonAtomicRedBlock = 2131 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func); 2132 BasicBlock *AtomicRedBlock = 2133 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func); 2134 SwitchInst *Switch = 2135 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2); 2136 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock); 2137 Switch->addCase(Builder.getInt32(2), AtomicRedBlock); 2138 2139 // Populate the non-atomic reduction using the elementwise reduction function. 2140 // This loads the elements from the global and private variables and reduces 2141 // them before storing back the result to the global variable. 2142 Builder.SetInsertPoint(NonAtomicRedBlock); 2143 for (auto En : enumerate(ReductionInfos)) { 2144 const ReductionInfo &RI = En.value(); 2145 Type *ValueType = RI.ElementType; 2146 Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable, 2147 "red.value." + Twine(En.index())); 2148 Value *PrivateRedValue = 2149 Builder.CreateLoad(ValueType, RI.PrivateVariable, 2150 "red.private.value." 
+ Twine(En.index())); 2151 Value *Reduced; 2152 Builder.restoreIP( 2153 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced)); 2154 if (!Builder.GetInsertBlock()) 2155 return InsertPointTy(); 2156 Builder.CreateStore(Reduced, RI.Variable); 2157 } 2158 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr( 2159 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait 2160 : RuntimeFunction::OMPRTL___kmpc_end_reduce); 2161 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock}); 2162 Builder.CreateBr(ContinuationBlock); 2163 2164 // Populate the atomic reduction using the atomic elementwise reduction 2165 // function. There are no loads/stores here because they will be happening 2166 // inside the atomic elementwise reduction. 2167 Builder.SetInsertPoint(AtomicRedBlock); 2168 if (CanGenerateAtomic) { 2169 for (const ReductionInfo &RI : ReductionInfos) { 2170 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType, 2171 RI.Variable, RI.PrivateVariable)); 2172 if (!Builder.GetInsertBlock()) 2173 return InsertPointTy(); 2174 } 2175 Builder.CreateBr(ContinuationBlock); 2176 } else { 2177 Builder.CreateUnreachable(); 2178 } 2179 2180 // Populate the outlined reduction function using the elementwise reduction 2181 // function. Partial values are extracted from the type-erased array of 2182 // pointers to private variables. 2183 BasicBlock *ReductionFuncBlock = 2184 BasicBlock::Create(Module->getContext(), "", ReductionFunc); 2185 Builder.SetInsertPoint(ReductionFuncBlock); 2186 Value *LHSArrayPtr = ReductionFunc->getArg(0); 2187 Value *RHSArrayPtr = ReductionFunc->getArg(1); 2188 2189 for (auto En : enumerate(ReductionInfos)) { 2190 const ReductionInfo &RI = En.value(); 2191 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( 2192 RedArrayTy, LHSArrayPtr, 0, En.index()); 2193 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr); 2194 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType()); 2195 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); 2196 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( 2197 RedArrayTy, RHSArrayPtr, 0, En.index()); 2198 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr); 2199 Value *RHSPtr = 2200 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType()); 2201 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); 2202 Value *Reduced; 2203 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced)); 2204 if (!Builder.GetInsertBlock()) 2205 return InsertPointTy(); 2206 Builder.CreateStore(Reduced, LHSPtr); 2207 } 2208 Builder.CreateRetVoid(); 2209 2210 Builder.SetInsertPoint(ContinuationBlock); 2211 return Builder.saveIP(); 2212 } 2213 2214 OpenMPIRBuilder::InsertPointTy 2215 OpenMPIRBuilder::createMaster(const LocationDescription &Loc, 2216 BodyGenCallbackTy BodyGenCB, 2217 FinalizeCallbackTy FiniCB) { 2218 2219 if (!updateToLocation(Loc)) 2220 return Loc.IP; 2221 2222 Directive OMPD = Directive::OMPD_master; 2223 uint32_t SrcLocStrSize; 2224 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 2225 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2226 Value *ThreadId = getOrCreateThreadID(Ident); 2227 Value *Args[] = {Ident, ThreadId}; 2228 2229 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master); 2230 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args); 2231 2232 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master); 2233 Instruction *ExitCall = 
Builder.CreateCall(ExitRTLFn, Args); 2234 2235 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 2236 /*Conditional*/ true, /*hasFinalize*/ true); 2237 } 2238 2239 OpenMPIRBuilder::InsertPointTy 2240 OpenMPIRBuilder::createMasked(const LocationDescription &Loc, 2241 BodyGenCallbackTy BodyGenCB, 2242 FinalizeCallbackTy FiniCB, Value *Filter) { 2243 if (!updateToLocation(Loc)) 2244 return Loc.IP; 2245 2246 Directive OMPD = Directive::OMPD_masked; 2247 uint32_t SrcLocStrSize; 2248 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 2249 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2250 Value *ThreadId = getOrCreateThreadID(Ident); 2251 Value *Args[] = {Ident, ThreadId, Filter}; 2252 Value *ArgsEnd[] = {Ident, ThreadId}; 2253 2254 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked); 2255 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args); 2256 2257 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked); 2258 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd); 2259 2260 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 2261 /*Conditional*/ true, /*hasFinalize*/ true); 2262 } 2263 2264 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton( 2265 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, 2266 BasicBlock *PostInsertBefore, const Twine &Name) { 2267 Module *M = F->getParent(); 2268 LLVMContext &Ctx = M->getContext(); 2269 Type *IndVarTy = TripCount->getType(); 2270 2271 // Create the basic block structure. 2272 BasicBlock *Preheader = 2273 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore); 2274 BasicBlock *Header = 2275 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore); 2276 BasicBlock *Cond = 2277 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore); 2278 BasicBlock *Body = 2279 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore); 2280 BasicBlock *Latch = 2281 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore); 2282 BasicBlock *Exit = 2283 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore); 2284 BasicBlock *After = 2285 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore); 2286 2287 // Use specified DebugLoc for new instructions. 2288 Builder.SetCurrentDebugLocation(DL); 2289 2290 Builder.SetInsertPoint(Preheader); 2291 Builder.CreateBr(Header); 2292 2293 Builder.SetInsertPoint(Header); 2294 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv"); 2295 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader); 2296 Builder.CreateBr(Cond); 2297 2298 Builder.SetInsertPoint(Cond); 2299 Value *Cmp = 2300 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp"); 2301 Builder.CreateCondBr(Cmp, Body, Exit); 2302 2303 Builder.SetInsertPoint(Body); 2304 Builder.CreateBr(Latch); 2305 2306 Builder.SetInsertPoint(Latch); 2307 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1), 2308 "omp_" + Name + ".next", /*HasNUW=*/true); 2309 Builder.CreateBr(Header); 2310 IndVarPHI->addIncoming(Next, Latch); 2311 2312 Builder.SetInsertPoint(Exit); 2313 Builder.CreateBr(After); 2314 2315 // Remember and return the canonical control flow. 
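// Schematically, the skeleton just created is:
//   preheader -> header -> cond -> body -> latch -> header (backedge)
// with cond branching to exit -> after once the trip count is reached.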
2316 LoopInfos.emplace_front();
2317 CanonicalLoopInfo *CL = &LoopInfos.front();
2318
2319 CL->Header = Header;
2320 CL->Cond = Cond;
2321 CL->Latch = Latch;
2322 CL->Exit = Exit;
2323
2324 #ifndef NDEBUG
2325 CL->assertOK();
2326 #endif
2327 return CL;
2328 }
2329
2330 CanonicalLoopInfo *
2331 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
2332 LoopBodyGenCallbackTy BodyGenCB,
2333 Value *TripCount, const Twine &Name) {
2334 BasicBlock *BB = Loc.IP.getBlock();
2335 BasicBlock *NextBB = BB->getNextNode();
2336
2337 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
2338 NextBB, NextBB, Name);
2339 BasicBlock *After = CL->getAfter();
2340
2341 // If location is not set, don't connect the loop.
2342 if (updateToLocation(Loc)) {
2343 // Split the loop at the insertion point: Branch to the preheader and move
2344 // every following instruction to after the loop (the After BB). Also, the
2345 // new successor is the loop's after block.
2346 spliceBB(Builder, After, /*CreateBranch=*/false);
2347 Builder.CreateBr(CL->getPreheader());
2348 }
2349
2350 // Emit the body content. We do it after connecting the loop to the CFG so
2351 // that the callback does not encounter degenerate BBs.
2352 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
2353
2354 #ifndef NDEBUG
2355 CL->assertOK();
2356 #endif
2357 return CL;
2358 }
2359
2360 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
2361 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
2362 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
2363 InsertPointTy ComputeIP, const Twine &Name) {
2364
2365 // Consider the following difficulties (assuming 8-bit signed integers):
2366 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
2367 // DO I = 1, 100, 50
2368 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
2369 // DO I = 100, 0, -128
2370
2371 // Start, Stop and Step must be of the same integer type.
2372 auto *IndVarTy = cast<IntegerType>(Start->getType());
2373 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
2374 assert(IndVarTy == Step->getType() && "Step type mismatch");
2375
2376 LocationDescription ComputeLoc =
2377 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
2378 updateToLocation(ComputeLoc);
2379
2380 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
2381 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
2382
2383 // Like Step, but always positive.
2384 Value *Incr = Step;
2385
2386 // Distance between Start and Stop; always positive.
2387 Value *Span;
2388
2389 // Condition checking whether no iterations are executed at all, e.g.,
2390 // because UB < LB.
2391 Value *ZeroCmp;
2392
2393 if (IsSigned) {
2394 // Ensure that increment is positive. If not, negate and invert LB and UB.
2395 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
2396 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
2397 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
2398 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
2399 Span = Builder.CreateSub(UB, LB, "", false, true);
2400 ZeroCmp = Builder.CreateICmp(
2401 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
2402 } else {
2403 Span = Builder.CreateSub(Stop, Start, "", true);
2404 ZeroCmp = Builder.CreateICmp(
2405 InclusiveStop ?
CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start); 2406 } 2407 2408 Value *CountIfLooping; 2409 if (InclusiveStop) { 2410 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One); 2411 } else { 2412 // Avoid incrementing past stop since it could overflow. 2413 Value *CountIfTwo = Builder.CreateAdd( 2414 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One); 2415 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr); 2416 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo); 2417 } 2418 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping, 2419 "omp_" + Name + ".tripcount"); 2420 2421 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) { 2422 Builder.restoreIP(CodeGenIP); 2423 Value *Span = Builder.CreateMul(IV, Step); 2424 Value *IndVar = Builder.CreateAdd(Span, Start); 2425 BodyGenCB(Builder.saveIP(), IndVar); 2426 }; 2427 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP(); 2428 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name); 2429 } 2430 2431 // Returns an LLVM function to call for initializing loop bounds using OpenMP 2432 // static scheduling depending on `type`. Only i32 and i64 are supported by the 2433 // runtime. Always interpret integers as unsigned similarly to 2434 // CanonicalLoopInfo. 2435 static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, 2436 OpenMPIRBuilder &OMPBuilder) { 2437 unsigned Bitwidth = Ty->getIntegerBitWidth(); 2438 if (Bitwidth == 32) 2439 return OMPBuilder.getOrCreateRuntimeFunction( 2440 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u); 2441 if (Bitwidth == 64) 2442 return OMPBuilder.getOrCreateRuntimeFunction( 2443 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u); 2444 llvm_unreachable("unknown OpenMP loop iterator bitwidth"); 2445 } 2446 2447 OpenMPIRBuilder::InsertPointTy 2448 OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, 2449 InsertPointTy AllocaIP, 2450 bool NeedsBarrier) { 2451 assert(CLI->isValid() && "Requires a valid canonical loop"); 2452 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) && 2453 "Require dedicated allocate IP"); 2454 2455 // Set up the source location value for OpenMP runtime. 2456 Builder.restoreIP(CLI->getPreheaderIP()); 2457 Builder.SetCurrentDebugLocation(DL); 2458 2459 uint32_t SrcLocStrSize; 2460 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 2461 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2462 2463 // Declare useful OpenMP runtime functions. 2464 Value *IV = CLI->getIndVar(); 2465 Type *IVTy = IV->getType(); 2466 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this); 2467 FunctionCallee StaticFini = 2468 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini); 2469 2470 // Allocate space for computed loop bounds as expected by the "init" function. 2471 Builder.restoreIP(AllocaIP); 2472 Type *I32Type = Type::getInt32Ty(M.getContext()); 2473 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 2474 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound"); 2475 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound"); 2476 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride"); 2477 2478 // At the end of the preheader, prepare for calling the "init" function by 2479 // storing the current loop bounds into the allocated space. A canonical loop 2480 // always iterates from 0 to trip-count with step 1. 
Note that "init" expects 2481 // and produces an inclusive upper bound. 2482 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); 2483 Constant *Zero = ConstantInt::get(IVTy, 0); 2484 Constant *One = ConstantInt::get(IVTy, 1); 2485 Builder.CreateStore(Zero, PLowerBound); 2486 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One); 2487 Builder.CreateStore(UpperBound, PUpperBound); 2488 Builder.CreateStore(One, PStride); 2489 2490 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 2491 2492 Constant *SchedulingType = ConstantInt::get( 2493 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic)); 2494 2495 // Call the "init" function and update the trip count of the loop with the 2496 // value it produced. 2497 Builder.CreateCall(StaticInit, 2498 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, 2499 PUpperBound, PStride, One, Zero}); 2500 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound); 2501 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound); 2502 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound); 2503 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One); 2504 CLI->setTripCount(TripCount); 2505 2506 // Update all uses of the induction variable except the one in the condition 2507 // block that compares it with the actual upper bound, and the increment in 2508 // the latch block. 2509 2510 CLI->mapIndVar([&](Instruction *OldIV) -> Value * { 2511 Builder.SetInsertPoint(CLI->getBody(), 2512 CLI->getBody()->getFirstInsertionPt()); 2513 Builder.SetCurrentDebugLocation(DL); 2514 return Builder.CreateAdd(OldIV, LowerBound); 2515 }); 2516 2517 // In the "exit" block, call the "fini" function. 2518 Builder.SetInsertPoint(CLI->getExit(), 2519 CLI->getExit()->getTerminator()->getIterator()); 2520 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); 2521 2522 // Add the barrier if requested. 2523 if (NeedsBarrier) 2524 createBarrier(LocationDescription(Builder.saveIP(), DL), 2525 omp::Directive::OMPD_for, /* ForceSimpleCall */ false, 2526 /* CheckCancelFlag */ false); 2527 2528 InsertPointTy AfterIP = CLI->getAfterIP(); 2529 CLI->invalidate(); 2530 2531 return AfterIP; 2532 } 2533 2534 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop( 2535 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, 2536 bool NeedsBarrier, Value *ChunkSize) { 2537 assert(CLI->isValid() && "Requires a valid canonical loop"); 2538 assert(ChunkSize && "Chunk size is required"); 2539 2540 LLVMContext &Ctx = CLI->getFunction()->getContext(); 2541 Value *IV = CLI->getIndVar(); 2542 Value *OrigTripCount = CLI->getTripCount(); 2543 Type *IVTy = IV->getType(); 2544 assert(IVTy->getIntegerBitWidth() <= 64 && 2545 "Max supported tripcount bitwidth is 64 bits"); 2546 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx) 2547 : Type::getInt64Ty(Ctx); 2548 Type *I32Type = Type::getInt32Ty(M.getContext()); 2549 Constant *Zero = ConstantInt::get(InternalIVTy, 0); 2550 Constant *One = ConstantInt::get(InternalIVTy, 1); 2551 2552 // Declare useful OpenMP runtime functions. 2553 FunctionCallee StaticInit = 2554 getKmpcForStaticInitForType(InternalIVTy, M, *this); 2555 FunctionCallee StaticFini = 2556 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini); 2557 2558 // Allocate space for computed loop bounds as expected by the "init" function. 
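// (These are the lastiter flag, lower bound, upper bound, and stride, each
// passed to the runtime by pointer.)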
2559 Builder.restoreIP(AllocaIP); 2560 Builder.SetCurrentDebugLocation(DL); 2561 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); 2562 Value *PLowerBound = 2563 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound"); 2564 Value *PUpperBound = 2565 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound"); 2566 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride"); 2567 2568 // Set up the source location value for the OpenMP runtime. 2569 Builder.restoreIP(CLI->getPreheaderIP()); 2570 Builder.SetCurrentDebugLocation(DL); 2571 2572 // TODO: Detect overflow in ubsan or max-out with current tripcount. 2573 Value *CastedChunkSize = 2574 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize"); 2575 Value *CastedTripCount = 2576 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount"); 2577 2578 Constant *SchedulingType = ConstantInt::get( 2579 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked)); 2580 Builder.CreateStore(Zero, PLowerBound); 2581 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One); 2582 Builder.CreateStore(OrigUpperBound, PUpperBound); 2583 Builder.CreateStore(One, PStride); 2584 2585 // Call the "init" function and update the trip count of the loop with the 2586 // value it produced. 2587 uint32_t SrcLocStrSize; 2588 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); 2589 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 2590 Value *ThreadNum = getOrCreateThreadID(SrcLoc); 2591 Builder.CreateCall(StaticInit, 2592 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum, 2593 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter, 2594 /*plower=*/PLowerBound, /*pupper=*/PUpperBound, 2595 /*pstride=*/PStride, /*incr=*/One, 2596 /*chunk=*/CastedChunkSize}); 2597 2598 // Load values written by the "init" function. 2599 Value *FirstChunkStart = 2600 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb"); 2601 Value *FirstChunkStop = 2602 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub"); 2603 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One); 2604 Value *ChunkRange = 2605 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range"); 2606 Value *NextChunkStride = 2607 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride"); 2608 2609 // Create outer "dispatch" loop for enumerating the chunks. 2610 BasicBlock *DispatchEnter = splitBB(Builder, true); 2611 Value *DispatchCounter; 2612 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop( 2613 {Builder.saveIP(), DL}, 2614 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; }, 2615 FirstChunkStart, CastedTripCount, NextChunkStride, 2616 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{}, 2617 "dispatch"); 2618 2619 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to 2620 // not have to preserve the canonical invariant. 2621 BasicBlock *DispatchBody = DispatchCLI->getBody(); 2622 BasicBlock *DispatchLatch = DispatchCLI->getLatch(); 2623 BasicBlock *DispatchExit = DispatchCLI->getExit(); 2624 BasicBlock *DispatchAfter = DispatchCLI->getAfter(); 2625 DispatchCLI->invalidate(); 2626 2627 // Rewire the original loop to become the chunk loop inside the dispatch loop. 2628 redirectTo(DispatchAfter, CLI->getAfter(), DL); 2629 redirectTo(CLI->getExit(), DispatchLatch, DL); 2630 redirectTo(DispatchBody, DispatchEnter, DL); 2631 2632 // Prepare the prolog of the chunk loop. 
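// Schematically, the nest built by the rewiring above is (illustration
// only, in pseudo-code):
//   for (dispatch = firstchunk.lb; dispatch < tripcount; dispatch += stride)
//     for (iv = 0; iv < chunk.tripcount; ++iv)
//       body(dispatch + iv);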
2633 Builder.restoreIP(CLI->getPreheaderIP()); 2634 Builder.SetCurrentDebugLocation(DL); 2635 2636 // Compute the number of iterations of the chunk loop. 2637 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); 2638 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange); 2639 Value *IsLastChunk = 2640 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last"); 2641 Value *CountUntilOrigTripCount = 2642 Builder.CreateSub(CastedTripCount, DispatchCounter); 2643 Value *ChunkTripCount = Builder.CreateSelect( 2644 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount"); 2645 Value *BackcastedChunkTC = 2646 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc"); 2647 CLI->setTripCount(BackcastedChunkTC); 2648 2649 // Update all uses of the induction variable except the one in the condition 2650 // block that compares it with the actual upper bound, and the increment in 2651 // the latch block. 2652 Value *BackcastedDispatchCounter = 2653 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc"); 2654 CLI->mapIndVar([&](Instruction *) -> Value * { 2655 Builder.restoreIP(CLI->getBodyIP()); 2656 return Builder.CreateAdd(IV, BackcastedDispatchCounter); 2657 }); 2658 2659 // In the "exit" block, call the "fini" function. 2660 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt()); 2661 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum}); 2662 2663 // Add the barrier if requested. 2664 if (NeedsBarrier) 2665 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for, 2666 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false); 2667 2668 #ifndef NDEBUG 2669 // Even though we currently do not support applying additional methods to it, 2670 // the chunk loop should remain a canonical loop. 2671 CLI->assertOK(); 2672 #endif 2673 2674 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()}; 2675 } 2676 2677 // Returns an LLVM function to call for executing an OpenMP static worksharing 2678 // for loop depending on `type`. Only i32 and i64 are supported by the runtime. 2679 // Always interpret integers as unsigned similarly to CanonicalLoopInfo. 
2680 static FunctionCallee 2681 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, 2682 WorksharingLoopType LoopType) { 2683 unsigned Bitwidth = Ty->getIntegerBitWidth(); 2684 Module &M = OMPBuilder->M; 2685 switch (LoopType) { 2686 case WorksharingLoopType::ForStaticLoop: 2687 if (Bitwidth == 32) 2688 return OMPBuilder->getOrCreateRuntimeFunction( 2689 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u); 2690 if (Bitwidth == 64) 2691 return OMPBuilder->getOrCreateRuntimeFunction( 2692 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u); 2693 break; 2694 case WorksharingLoopType::DistributeStaticLoop: 2695 if (Bitwidth == 32) 2696 return OMPBuilder->getOrCreateRuntimeFunction( 2697 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u); 2698 if (Bitwidth == 64) 2699 return OMPBuilder->getOrCreateRuntimeFunction( 2700 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u); 2701 break; 2702 case WorksharingLoopType::DistributeForStaticLoop: 2703 if (Bitwidth == 32) 2704 return OMPBuilder->getOrCreateRuntimeFunction( 2705 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u); 2706 if (Bitwidth == 64) 2707 return OMPBuilder->getOrCreateRuntimeFunction( 2708 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u); 2709 break; 2710 } 2711 if (Bitwidth != 32 && Bitwidth != 64) { 2712 llvm_unreachable("Unknown OpenMP loop iterator bitwidth"); 2713 } 2714 llvm_unreachable("Unknown type of OpenMP worksharing loop"); 2715 } 2716 2717 // Inserts a call to proper OpenMP Device RTL function which handles 2718 // loop worksharing. 2719 static void createTargetLoopWorkshareCall( 2720 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, 2721 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, 2722 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) { 2723 Type *TripCountTy = TripCount->getType(); 2724 Module &M = OMPBuilder->M; 2725 IRBuilder<> &Builder = OMPBuilder->Builder; 2726 FunctionCallee RTLFn = 2727 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType); 2728 SmallVector<Value *, 8> RealArgs; 2729 RealArgs.push_back(Ident); 2730 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr)); 2731 RealArgs.push_back(LoopBodyArg); 2732 RealArgs.push_back(TripCount); 2733 if (LoopType == WorksharingLoopType::DistributeStaticLoop) { 2734 RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); 2735 Builder.CreateCall(RTLFn, RealArgs); 2736 return; 2737 } 2738 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction( 2739 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads); 2740 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())}); 2741 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {}); 2742 2743 RealArgs.push_back( 2744 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast")); 2745 RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); 2746 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) { 2747 RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); 2748 } 2749 2750 Builder.CreateCall(RTLFn, RealArgs); 2751 } 2752 2753 static void 2754 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, 2755 CanonicalLoopInfo *CLI, Value *Ident, 2756 Function &OutlinedFn, Type *ParallelTaskPtr, 2757 const SmallVector<Instruction *, 4> &ToBeDeleted, 2758 WorksharingLoopType LoopType) { 2759 IRBuilder<> &Builder = OMPIRBuilder->Builder; 2760 BasicBlock *Preheader = CLI->getPreheader(); 2761 Value *TripCount = CLI->getTripCount(); 2762 
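// (Overview, for orientation: at this point the loop body has already been
// outlined into OutlinedFn; the steps below shrink the remaining loop
// skeleton to a single device-RTL call emitted in the preheader.)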
2763 // After loop body outlining, the loop body contains only the setup of the
2764 // loop body argument structure and the call to the outlined
2765 // loop body function. First, we need to move the setup of the loop body args
2766 // into the loop preheader.
2767 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
2768 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
2769
2770 // The next step is to remove the whole loop. We do not need it anymore.
2771 // That's why we create an unconditional branch from the loop preheader to the
2772 // loop exit block.
2773 Builder.restoreIP({Preheader, Preheader->end()});
2774 Preheader->getTerminator()->eraseFromParent();
2775 Builder.CreateBr(CLI->getExit());
2776
2777 // Delete dead loop blocks.
2778 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
2779 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
2780 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
2781 CleanUpInfo.EntryBB = CLI->getHeader();
2782 CleanUpInfo.ExitBB = CLI->getExit();
2783 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
2784 DeleteDeadBlocks(BlocksToBeRemoved);
2785
2786 // Find the instruction which corresponds to the loop body argument structure
2787 // and remove the call to the loop body function.
2788 Value *LoopBodyArg;
2789 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
2790 assert(OutlinedFnUser &&
2791 "Expected unique undroppable user of outlined function");
2792 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
2793 assert(OutlinedFnCallInstruction && "Expected outlined function call");
2794 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
2795 "Expected outlined function call to be located in loop preheader");
2796 // Check in case no argument structure has been passed.
2797 if (OutlinedFnCallInstruction->arg_size() > 1)
2798 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
2799 else
2800 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
2801 OutlinedFnCallInstruction->eraseFromParent();
2802
2803 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
2804 LoopBodyArg, ParallelTaskPtr, TripCount,
2805 OutlinedFn);
2806
2807 for (auto &ToBeDeletedItem : ToBeDeleted)
2808 ToBeDeletedItem->eraseFromParent();
2809 CLI->invalidate();
2810 }
2811
2812 OpenMPIRBuilder::InsertPointTy
2813 OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
2814 InsertPointTy AllocaIP,
2815 WorksharingLoopType LoopType) {
2816 uint32_t SrcLocStrSize;
2817 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2818 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2819
2820 OutlineInfo OI;
2821 OI.OuterAllocaBB = CLI->getPreheader();
2822 Function *OuterFn = CLI->getPreheader()->getParent();
2823
2824 // Instructions which need to be deleted at the end of code generation.
2825 SmallVector<Instruction *, 4> ToBeDeleted;
2826
2827 OI.OuterAllocaBB = AllocaIP.getBlock();
2828
2829 // Mark the loop body as the region which needs to be extracted.
2830 OI.EntryBB = CLI->getBody();
2831 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
2832 "omp.prelatch", true);
2833
2834 // Prepare the loop body for extraction.
2835 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
2836
2837 // Insert a new loop counter variable which will be used only in the loop
2838 // body.
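// (The alloca/load pair below is only a modeling device: the load stands in
// for the future loop-counter argument, and both instructions are deleted
// once outlining has completed.)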
2839 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
2840 Instruction *NewLoopCntLoad =
2841 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
2842 // The new loop counter instructions are redundant in the loop preheader once
2843 // code generation for the workshare loop is finished. That's why we mark them
2844 // as ready for deletion.
2845 ToBeDeleted.push_back(NewLoopCntLoad);
2846 ToBeDeleted.push_back(NewLoopCnt);
2847
2848 // Analyse the loop body region. Find all input variables which are used
2849 // inside the loop body region.
2850 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
2851 SmallVector<BasicBlock *, 32> Blocks;
2852 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
2853 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
2854 ParallelRegionBlockSet.end());
2855
2856 CodeExtractorAnalysisCache CEAC(*OuterFn);
2857 CodeExtractor Extractor(Blocks,
2858 /* DominatorTree */ nullptr,
2859 /* AggregateArgs */ true,
2860 /* BlockFrequencyInfo */ nullptr,
2861 /* BranchProbabilityInfo */ nullptr,
2862 /* AssumptionCache */ nullptr,
2863 /* AllowVarArgs */ true,
2864 /* AllowAlloca */ true,
2865 /* AllocationBlock */ CLI->getPreheader(),
2866 /* Suffix */ ".omp_wsloop",
2867 /* AggrArgsIn0AddrSpace */ true);
2868
2869 BasicBlock *CommonExit = nullptr;
2870 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
2871
2872 // Find allocas outside the loop body region which are used inside the loop
2873 // body.
2874 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
2875
2876 // We need to model the loop body region as the function f(cnt, loop_arg).
2877 // That's why we replace the loop induction variable with the new counter
2878 // which will be one of the loop body function arguments.
2879 for (auto Use = CLI->getIndVar()->user_begin();
2880 Use != CLI->getIndVar()->user_end(); ++Use) {
2881 if (Instruction *Inst = dyn_cast<Instruction>(*Use)) {
2882 if (ParallelRegionBlockSet.count(Inst->getParent())) {
2883 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
2884 }
2885 }
2886 }
2887 // Make sure that the loop counter variable is not merged into the loop body
2888 // function argument structure and that it is passed as a separate variable.
2889 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
2890
2891 // The PostOutline CB is invoked when the loop body function is outlined and
2892 // the loop body is replaced by a call to the outlined function. We need to
2893 // add a call to the OpenMP device RTL inside the loop preheader. The OpenMP
2894 // device RTL function will handle the loop control logic.
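// For illustration, assuming a 32-bit induction variable and the
// ForStaticLoop kind, the preheader is expected to end with a call roughly
// like:
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @outlined_body,
//                ptr %body_args, i32 %tripcount, i32 %num_threads, i32 0)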
  OI.PostOutlineCB = [=, ToBeDeletedVec =
                             std::move(ToBeDeleted)](Function &OutlinedFn) {
    workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
                                ToBeDeletedVec, LoopType);
  };
  addOutlineInfo(std::move(OI));
  return CLI->getAfterIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
    bool HasSimdModifier, bool HasMonotonicModifier,
    bool HasNonmonotonicModifier, bool HasOrderedClause,
    WorksharingLoopType LoopType) {
  if (Config.isTargetDevice())
    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
  OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
      SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                   OMPScheduleType::ModifierOrdered;
  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
  case OMPScheduleType::BaseStatic:
    assert(!ChunkSize && "No chunk size with static (non-chunked) schedule");
    if (IsOrdered)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);

  case OMPScheduleType::BaseStaticChunked:
    if (IsOrdered)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
                                           ChunkSize);

  case OMPScheduleType::BaseRuntime:
  case OMPScheduleType::BaseAuto:
  case OMPScheduleType::BaseGreedy:
  case OMPScheduleType::BaseBalanced:
  case OMPScheduleType::BaseSteal:
  case OMPScheduleType::BaseGuidedSimd:
  case OMPScheduleType::BaseRuntimeSimd:
    assert(!ChunkSize &&
           "schedule type does not support user-defined chunk sizes");
    [[fallthrough]];
  case OMPScheduleType::BaseDynamicChunked:
  case OMPScheduleType::BaseGuidedChunked:
  case OMPScheduleType::BaseGuidedIterativeChunked:
  case OMPScheduleType::BaseGuidedAnalyticalChunked:
  case OMPScheduleType::BaseStaticBalancedChunked:
    return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                     NeedsBarrier, ChunkSize);

  default:
    llvm_unreachable("Unknown/unimplemented schedule kind");
  }
}

/// Returns an LLVM function to call for initializing loop bounds using OpenMP
/// dynamic scheduling, depending on `type`. Only i32 and i64 are supported by
/// the runtime. Always interpret integers as unsigned similarly to
/// CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

/// Returns an LLVM function to call for retrieving the next chunk of the loop
/// using OpenMP dynamic scheduling, depending on `type`. Only i32 and i64 are
/// supported by the runtime. Always interpret integers as unsigned similarly
/// to CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

/// Returns an LLVM function to call for finalizing the dynamic loop,
/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
/// interpret integers as unsigned similarly to CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  assert(isValidWorkshareLoopScheduleType(SchedType) &&
         "Require valid schedule type");

  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  // Set up the source location value for the OpenMP runtime.
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
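  // As an overview, the rest of this function rewrites the canonical loop
  // into the following runtime protocol (a sketch, not the literal IR; the
  // _8u variants are used for i64 induction variables):
  //   __kmpc_dispatch_init_4u(loc, tid, sched, /*lb=*/1, /*ub=*/tripcount,
  //                           /*stride=*/1, chunk);
  //   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &stride))
  //     for (iv = lb - 1; iv < ub; ++iv)
  //       <loop body>;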
  // Allocate space for the computed loop bounds as expected by the "init"
  // function.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  BasicBlock *Latch = CLI->getLatch();
  InsertPointTy AfterIP = CLI->getAfterIP();

  // The CLI will be "broken" in the code below, as the loop is no longer
  // a valid canonical loop.

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function.
  Builder.CreateCall(DynamicInit,
                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
                      UpperBound, /* step */ One, Chunk});

  // An outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  // The result of the "next" call is always i32, so the zero constant used in
  // the comparison below must be 32-bit and cannot reuse an IVTy constant.
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *Res =
      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
                                       PLowerBound, PUpperBound, PStride});
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change the PHI node in the loop header to use OuterCond rather than the
  // preheader as the incoming block, and set the IV to the LowerBound.
  Instruction *Phi = &Header->front();
  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the preheader to jump to OuterCond.
  Instruction *Term = PreHeader->getTerminator();
  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * Jump to the outer loop when the current chunk is done.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  Instruction *Comp = &*Builder.GetInsertPoint();
  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to the outer condition.
  Instruction *Branch = &Cond->back();
  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Call the "fini" function if "ordered" is present in the wsloop directive.
  if (Ordered) {
    Builder.SetInsertPoint(&Latch->back());
    FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
    Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
  }

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  }

  CLI->invalidate();
  return AfterIP;
}

/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
/// after this \p OldTarget will be orphaned.
static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
                                      BasicBlock *NewTarget, DebugLoc DL) {
  for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
    redirectTo(Pred, NewTarget, DL);
}

/// Determine which blocks in \p BBs are reachable from outside and remove from
/// the function the ones that are not.
static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
  SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    for (Use &U : BB->uses()) {
      auto *UseInst = dyn_cast<Instruction>(U.getUser());
      if (!UseInst)
        continue;
      if (BBsToErase.count(UseInst->getParent()))
        continue;
      return true;
    }
    return false;
  };

  while (true) {
    bool Changed = false;
    for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
      if (HasRemainingUses(BB)) {
        BBsToErase.erase(BB);
        Changed = true;
      }
    }
    if (!Changed)
      break;
  }

  SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
  DeleteDeadBlocks(BBVec);
}

CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Set up the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop's trip count.
  // TODO: Find common/largest indvar type.
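  // E.g., for a two-loop nest with trip counts TC0 (outer) and TC1 (inner),
  // the collapsed loop runs TC0 * TC1 iterations. The multiplication below
  // carries the no-unsigned-wrap flag, assuming the combined iteration space
  // does not overflow the induction variable type.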
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
                                           {}, /*HasNUW=*/true);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop uses the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // The outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following the direction of
  // control flow: the leading in-between code, the loop nest body, the
  // trailing in-between code, and finally rejoining the collapsed loop's
  // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
  // next edge. If ContinueBlock is set, continue with that block; if
  // ContinuePred is set, use its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // than in the original loop. More sophisticated schemes could keep track of
  // what the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
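  // For the two-loop example above, with %iv being the collapsed induction
  // variable, the derived induction variables are:
  //   %iv.inner = urem %iv, %TC1   ; least significant "digit"
  //   %iv.outer = udiv %iv, %TC1   ; remaining bits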
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}

std::vector<CanonicalLoopInfo *>
OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                           ArrayRef<Value *> TileSizes) {
  assert(TileSizes.size() == Loops.size() &&
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

  CanonicalLoopInfo *OutermostLoop = Loops.front();
  CanonicalLoopInfo *InnermostLoop = Loops.back();
  Function *F = OutermostLoop->getBody()->getParent();
  BasicBlock *InnerEnter = InnermostLoop->getBody();
  BasicBlock *InnerLatch = InnermostLoop->getLatch();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Collect the original trip counts and induction variables to be accessible
  // by index. Also, the structure of the original loops is not preserved
  // during the construction of the tiled loops, so do it before we scavenge
  // the BBs of any original CanonicalLoopInfo.
  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());
    OrigIndVars.push_back(L->getIndVar());
  }

  // Collect the code between loop headers. These may contain SSA definitions
  // that are used in the loop nest body. To be usable within the innermost
  // body, these BasicBlocks will be sunk into the loop nest body. That is,
  // these instructions may be executed more often than before the tiling.
  // TODO: It would be sufficient to only sink them into the body of the
  // corresponding tile loop.
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
  for (int i = 0; i < NumLoops - 1; ++i) {
    CanonicalLoopInfo *Surrounding = Loops[i];
    CanonicalLoopInfo *Nested = Loops[i + 1];

    BasicBlock *EnterBB = Surrounding->getBody();
    BasicBlock *ExitBB = Nested->getHeader();
    InbetweenCode.emplace_back(EnterBB, ExitBB);
  }

  // Compute the trip counts of the floor loops.
  Builder.SetCurrentDebugLocation(DL);
  Builder.restoreIP(OutermostLoop->getPreheaderIP());
  SmallVector<Value *, 4> FloorCount, FloorRems;
  for (int i = 0; i < NumLoops; ++i) {
    Value *TileSize = TileSizes[i];
    Value *OrigTripCount = OrigTripCounts[i];
    Type *IVType = OrigTripCount->getType();

    Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
    Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);

    // 0 if the tile size divides the trip count, 1 otherwise.
    // 1 means we need an additional iteration for a partial tile.
    //
    // Unfortunately we cannot just use the roundup formula
    //   (tripcount + tilesize - 1) / tilesize
    // because the summation might overflow. We do not want to introduce
    // undefined behavior where the untiled loop nest had none.
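    // Worked example: OrigTripCount = 10 and TileSize = 4 give
    // FloorTripCount = 2 and FloorTripRem = 2, so the floor loop runs
    // 2 + 1 = 3 times: two full tiles of 4 iterations and one partial tile
    // of 2.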
    Value *FloorTripOverflow =
        Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));

    FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
    FloorTripCount =
        Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
                          "omp_floor" + Twine(i) + ".tripcount", true);

    // Remember some values for later use.
    FloorCount.push_back(FloorTripCount);
    FloorRems.push_back(FloorTripRem);
  }

  // Generate the new loop nest, from the outermost to the innermost.
  std::vector<CanonicalLoopInfo *> Result;
  Result.reserve(NumLoops * 2);

  // The basic block of the surrounding loop that enters the generated loop
  // nest.
  BasicBlock *Enter = OutermostLoop->getPreheader();

  // The basic block of the surrounding loop where the inner code should
  // continue.
  BasicBlock *Continue = OutermostLoop->getAfter();

  // Where the next loop basic block should be inserted.
  BasicBlock *OutroInsertBefore = InnermostLoop->getExit();

  auto EmbeddNewLoop =
      [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
          Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
    CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
        DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
    redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
    redirectTo(EmbeddedLoop->getAfter(), Continue, DL);

    // Set up the position where the next embedded loop connects to this loop.
    Enter = EmbeddedLoop->getBody();
    Continue = EmbeddedLoop->getLatch();
    OutroInsertBefore = EmbeddedLoop->getLatch();
    return EmbeddedLoop;
  };

  auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
                                                  const Twine &NameBase) {
    for (auto P : enumerate(TripCounts)) {
      CanonicalLoopInfo *EmbeddedLoop =
          EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
      Result.push_back(EmbeddedLoop);
    }
  };

  EmbeddNewLoops(FloorCount, "floor");

  // Within the innermost floor loop, emit the code that computes the trip
  // counts of the tile loops.
  Builder.SetInsertPoint(Enter->getTerminator());
  SmallVector<Value *, 4> TileCounts;
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    Value *TileSize = TileSizes[i];

    Value *FloorIsEpilogue =
        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
    Value *TileTripCount =
        Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);

    TileCounts.push_back(TileTripCount);
  }

  // Create the tile loops.
  EmbeddNewLoops(TileCounts, "tile");

  // Insert the inbetween code into the body.
  BasicBlock *BodyEnter = Enter;
  BasicBlock *BodyEntered = nullptr;
  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
    BasicBlock *EnterBB = P.first;
    BasicBlock *ExitBB = P.second;

    if (BodyEnter)
      redirectTo(BodyEnter, EnterBB, DL);
    else
      redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);

    BodyEnter = nullptr;
    BodyEntered = ExitBB;
  }

  // Append the original loop nest body into the generated loop nest body.
3457 if (BodyEnter) 3458 redirectTo(BodyEnter, InnerEnter, DL); 3459 else 3460 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL); 3461 redirectAllPredecessorsTo(InnerLatch, Continue, DL); 3462 3463 // Replace the original induction variable with an induction variable computed 3464 // from the tile and floor induction variables. 3465 Builder.restoreIP(Result.back()->getBodyIP()); 3466 for (int i = 0; i < NumLoops; ++i) { 3467 CanonicalLoopInfo *FloorLoop = Result[i]; 3468 CanonicalLoopInfo *TileLoop = Result[NumLoops + i]; 3469 Value *OrigIndVar = OrigIndVars[i]; 3470 Value *Size = TileSizes[i]; 3471 3472 Value *Scale = 3473 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true); 3474 Value *Shift = 3475 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true); 3476 OrigIndVar->replaceAllUsesWith(Shift); 3477 } 3478 3479 // Remove unused parts of the original loops. 3480 removeUnusedBlocksFromParent(OldControlBBs); 3481 3482 for (CanonicalLoopInfo *L : Loops) 3483 L->invalidate(); 3484 3485 #ifndef NDEBUG 3486 for (CanonicalLoopInfo *GenL : Result) 3487 GenL->assertOK(); 3488 #endif 3489 return Result; 3490 } 3491 3492 /// Attach metadata \p Properties to the basic block described by \p BB. If the 3493 /// basic block already has metadata, the basic block properties are appended. 3494 static void addBasicBlockMetadata(BasicBlock *BB, 3495 ArrayRef<Metadata *> Properties) { 3496 // Nothing to do if no property to attach. 3497 if (Properties.empty()) 3498 return; 3499 3500 LLVMContext &Ctx = BB->getContext(); 3501 SmallVector<Metadata *> NewProperties; 3502 NewProperties.push_back(nullptr); 3503 3504 // If the basic block already has metadata, prepend it to the new metadata. 3505 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop); 3506 if (Existing) 3507 append_range(NewProperties, drop_begin(Existing->operands(), 1)); 3508 3509 append_range(NewProperties, Properties); 3510 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties); 3511 BasicBlockID->replaceOperandWith(0, BasicBlockID); 3512 3513 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID); 3514 } 3515 3516 /// Attach loop metadata \p Properties to the loop described by \p Loop. If the 3517 /// loop already has metadata, the loop properties are appended. 3518 static void addLoopMetadata(CanonicalLoopInfo *Loop, 3519 ArrayRef<Metadata *> Properties) { 3520 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo"); 3521 3522 // Attach metadata to the loop's latch 3523 BasicBlock *Latch = Loop->getLatch(); 3524 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch"); 3525 addBasicBlockMetadata(Latch, Properties); 3526 } 3527 3528 /// Attach llvm.access.group metadata to the memref instructions of \p Block 3529 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, 3530 LoopInfo &LI) { 3531 for (Instruction &I : *Block) { 3532 if (I.mayReadOrWriteMemory()) { 3533 // TODO: This instruction may already have access group from 3534 // other pragmas e.g. #pragma clang loop vectorize. Append 3535 // so that the existing metadata is not overwritten. 
      I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
    }
  }
}

void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
             MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
}

void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {
                MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
            });
}

void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // Define where the if branch should be inserted.
  Instruction *SplitBefore;
  if (auto *IfCondInst = dyn_cast<Instruction>(IfCond))
    SplitBefore = IfCondInst;
  else
    SplitBefore = CanonicalLoop->getPreheader()->getTerminator();

  // TODO: We should not rely on the pass manager. Currently we use the pass
  // manager only for getting the llvm::Loop which corresponds to the given
  // CanonicalLoopInfo object. We should have a method which returns all
  // blocks between CanonicalLoopInfo::getHeader() and
  // CanonicalLoopInfo::getAfter().
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  // Get the loop which needs to be cloned.
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());

  // Create additional blocks for the if statement.
  BasicBlock *Head = SplitBefore->getParent();
  Instruction *HeadOldTerm = Head->getTerminator();
  llvm::LLVMContext &C = Head->getContext();
  llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
  llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());

  // Create the if condition branch.
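  // The resulting CFG is, roughly (block names derive from NamePrefix, e.g.
  // "simd.if.then"):
  //   head:     br i1 %IfCond, label %if.then, label %if.else
  //   if.then:  branch to the original loop
  //   if.else:  branch to the cloned loop
  // with both versions rejoining at the original loop exit.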
  Builder.SetInsertPoint(HeadOldTerm);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // The then block contains the branch to the OpenMP loop, which needs to be
  // vectorized.
  spliceBB(IP, ThenBlock, false);
  ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone the loop for the else branch.
  SmallVector<BasicBlock *, 8> NewBlocks;

  VMap[CanonicalLoop->getPreheader()] = ElseBlock;
  for (BasicBlock *Block : L->getBlocks()) {
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());
}

unsigned
OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                                           const StringMap<bool> &Features) {
  if (TargetTriple.isX86()) {
    if (Features.lookup("avx512f"))
      return 512;
    if (Features.lookup("avx"))
      return 256;
    return 128;
  }
  if (TargetTriple.isPPC())
    return 128;
  if (TargetTriple.isWasm())
    return 128;
  return 0;
}

void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
                                MapVector<Value *, Value *> AlignedVars,
                                Value *IfCond, OrderKind Order,
                                ConstantInt *Simdlen, ConstantInt *Safelen) {
  LLVMContext &Ctx = Builder.getContext();

  Function *F = CanonicalLoop->getFunction();

  // TODO: We should not rely on the pass manager. Currently we use the pass
  // manager only for getting the llvm::Loop which corresponds to the given
  // CanonicalLoopInfo object.
  // We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter().
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);

  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
  if (!AlignedVars.empty()) {
    InsertPointTy IP = Builder.saveIP();
    Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
    for (auto &AlignedItem : AlignedVars) {
      Value *AlignedPtr = AlignedItem.first;
      Value *Alignment = AlignedItem.second;
      Builder.CreateAlignmentAssumption(F->getParent()->getDataLayout(),
                                        AlignedPtr, Alignment);
    }
    Builder.restoreIP(IP);
  }

  if (IfCond) {
    ValueToValueMapTy VMap;
    createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
    // Add metadata to the cloned loop which disables vectorization.
    Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
    assert(MappedLatch &&
           "Cannot find value which corresponds to original loop latch");
    BasicBlock *NewLatchBlock = cast<BasicBlock>(MappedLatch);
    ConstantAsMetadata *BoolConst =
        ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
    addBasicBlockMetadata(
        NewLatchBlock,
        {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
                           BoolConst})});
  }

  SmallSet<BasicBlock *, 8> Reachable;

  // Get the basic blocks from the loop in which memref instructions
  // can be found.
  // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : L->getBlocks()) {
    if (Block == CanonicalLoop->getCond() ||
        Block == CanonicalLoop->getHeader())
      continue;
    Reachable.insert(Block);
  }

  SmallVector<Metadata *> LoopMDList;

  // In the presence of a finite 'safelen', it may be unsafe to mark all
  // the memory instructions parallel, because loop-carried dependences at a
  // distance of 'safelen' iterations or more are possible.
  // If the order(concurrent) clause is specified, the memory instructions
  // are marked parallel even if 'safelen' is finite.
  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
    // Add access group metadata to memory-access instructions.
    MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
    for (BasicBlock *BB : Reachable)
      addSimdMetadata(BB, AccessGroup, LI);
    // TODO: If the loop has existing parallel access metadata, have
    // to combine two lists.
    LoopMDList.push_back(MDNode::get(
        Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
  }

  // Use the above access group metadata to create loop level
  // metadata, which should be distinct for each loop.
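  // For illustration, with simdlen(8) the loop latch ends up with metadata
  // along the lines of (metadata numbering is arbitrary):
  //   br ... !llvm.loop !0
  //   !0 = distinct !{!0, !1, !2, !3}
  //   !1 = !{!"llvm.loop.parallel_accesses", !4}
  //   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
  //   !3 = !{!"llvm.loop.vectorize.width", i32 8}
  //   !4 = distinct !{}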
  ConstantAsMetadata *BoolConst =
      ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));

  if (Simdlen || Safelen) {
    // If both simdlen and safelen clauses are specified, the value of the
    // simdlen parameter must be less than or equal to the value of the safelen
    // parameter. Therefore, use safelen only in the absence of simdlen.
    ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
    LoopMDList.push_back(
        MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
                          ConstantAsMetadata::get(VectorizeWidth)}));
  }

  addLoopMetadata(CanonicalLoop, LoopMDList);
}

/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
/// needed for the LLVM pass pipeline. We use some default options to avoid
/// having to pass too many settings from the frontend that probably do not
/// matter.
///
/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
/// method. If we are going to use TargetMachine for more purposes, especially
/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
/// might become worth requiring front-ends to pass on their TargetMachine,
/// or at least cache it between methods. Note that while front-ends such as
/// Clang have just a single main TargetMachine per translation unit,
/// "target-cpu" and "target-features" that determine the TargetMachine are
/// per-function and can be overridden using
/// __attribute__((target("OPTIONS"))).
static std::unique_ptr<TargetMachine>
createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
  Module *M = F->getParent();

  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const std::string &Triple = M->getTargetTriple();

  std::string Error;
  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
  if (!TheTarget)
    return {};

  llvm::TargetOptions Options;
  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
      Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
      /*CodeModel=*/std::nullopt, OptLevel));
}

/// Heuristically determine the best-performing unroll factor for \p CLI. This
/// depends on the target processor. We are re-using the same heuristics as the
/// LoopUnrollPass.
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
  Function *F = CLI->getFunction();

  // Assume the user requests the most aggressive unrolling, even if the rest
  // of the code is optimized using a lower setting.
3780 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive; 3781 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel); 3782 3783 FunctionAnalysisManager FAM; 3784 FAM.registerPass([]() { return TargetLibraryAnalysis(); }); 3785 FAM.registerPass([]() { return AssumptionAnalysis(); }); 3786 FAM.registerPass([]() { return DominatorTreeAnalysis(); }); 3787 FAM.registerPass([]() { return LoopAnalysis(); }); 3788 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); }); 3789 FAM.registerPass([]() { return PassInstrumentationAnalysis(); }); 3790 TargetIRAnalysis TIRA; 3791 if (TM) 3792 TIRA = TargetIRAnalysis( 3793 [&](const Function &F) { return TM->getTargetTransformInfo(F); }); 3794 FAM.registerPass([&]() { return TIRA; }); 3795 3796 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM); 3797 ScalarEvolutionAnalysis SEA; 3798 ScalarEvolution &&SE = SEA.run(*F, FAM); 3799 DominatorTreeAnalysis DTA; 3800 DominatorTree &&DT = DTA.run(*F, FAM); 3801 LoopAnalysis LIA; 3802 LoopInfo &&LI = LIA.run(*F, FAM); 3803 AssumptionAnalysis ACT; 3804 AssumptionCache &&AC = ACT.run(*F, FAM); 3805 OptimizationRemarkEmitter ORE{F}; 3806 3807 Loop *L = LI.getLoopFor(CLI->getHeader()); 3808 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop"); 3809 3810 TargetTransformInfo::UnrollingPreferences UP = 3811 gatherUnrollingPreferences(L, SE, TTI, 3812 /*BlockFrequencyInfo=*/nullptr, 3813 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel), 3814 /*UserThreshold=*/std::nullopt, 3815 /*UserCount=*/std::nullopt, 3816 /*UserAllowPartial=*/true, 3817 /*UserAllowRuntime=*/true, 3818 /*UserUpperBound=*/std::nullopt, 3819 /*UserFullUnrollMaxCount=*/std::nullopt); 3820 3821 UP.Force = true; 3822 3823 // Account for additional optimizations taking place before the LoopUnrollPass 3824 // would unroll the loop. 3825 UP.Threshold *= UnrollThresholdFactor; 3826 UP.PartialThreshold *= UnrollThresholdFactor; 3827 3828 // Use normal unroll factors even if the rest of the code is optimized for 3829 // size. 3830 UP.OptSizeThreshold = UP.Threshold; 3831 UP.PartialOptSizeThreshold = UP.PartialThreshold; 3832 3833 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n" 3834 << " Threshold=" << UP.Threshold << "\n" 3835 << " PartialThreshold=" << UP.PartialThreshold << "\n" 3836 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n" 3837 << " PartialOptSizeThreshold=" 3838 << UP.PartialOptSizeThreshold << "\n"); 3839 3840 // Disable peeling. 3841 TargetTransformInfo::PeelingPreferences PP = 3842 gatherPeelingPreferences(L, SE, TTI, 3843 /*UserAllowPeeling=*/false, 3844 /*UserAllowProfileBasedPeeling=*/false, 3845 /*UnrollingSpecficValues=*/false); 3846 3847 SmallPtrSet<const Value *, 32> EphValues; 3848 CodeMetrics::collectEphemeralValues(L, &AC, EphValues); 3849 3850 // Assume that reads and writes to stack variables can be eliminated by 3851 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's 3852 // size. 
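  // For instance, in
  //   %sum = alloca i32        ; in the function entry block
  //   ...
  //   %v = load i32, ptr %sum  ; inside the loop
  // the load (and any matching store) is added to the ephemeral values below,
  // assuming mem2reg/SROA will promote %sum to a register.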
3853 for (BasicBlock *BB : L->blocks()) { 3854 for (Instruction &I : *BB) { 3855 Value *Ptr; 3856 if (auto *Load = dyn_cast<LoadInst>(&I)) { 3857 Ptr = Load->getPointerOperand(); 3858 } else if (auto *Store = dyn_cast<StoreInst>(&I)) { 3859 Ptr = Store->getPointerOperand(); 3860 } else 3861 continue; 3862 3863 Ptr = Ptr->stripPointerCasts(); 3864 3865 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) { 3866 if (Alloca->getParent() == &F->getEntryBlock()) 3867 EphValues.insert(&I); 3868 } 3869 } 3870 } 3871 3872 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); 3873 3874 // Loop is not unrollable if the loop contains certain instructions. 3875 if (!UCE.canUnroll() || UCE.Convergent) { 3876 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); 3877 return 1; 3878 } 3879 3880 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize() 3881 << "\n"); 3882 3883 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might 3884 // be able to use it. 3885 int TripCount = 0; 3886 int MaxTripCount = 0; 3887 bool MaxOrZero = false; 3888 unsigned TripMultiple = 0; 3889 3890 bool UseUpperBound = false; 3891 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount, 3892 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP, 3893 UseUpperBound); 3894 unsigned Factor = UP.Count; 3895 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n"); 3896 3897 // This function returns 1 to signal to not unroll a loop. 3898 if (Factor == 0) 3899 return 1; 3900 return Factor; 3901 } 3902 3903 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, 3904 int32_t Factor, 3905 CanonicalLoopInfo **UnrolledCLI) { 3906 assert(Factor >= 0 && "Unroll factor must not be negative"); 3907 3908 Function *F = Loop->getFunction(); 3909 LLVMContext &Ctx = F->getContext(); 3910 3911 // If the unrolled loop is not used for another loop-associated directive, it 3912 // is sufficient to add metadata for the LoopUnrollPass. 3913 if (!UnrolledCLI) { 3914 SmallVector<Metadata *, 2> LoopMetadata; 3915 LoopMetadata.push_back( 3916 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable"))); 3917 3918 if (Factor >= 1) { 3919 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get( 3920 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor))); 3921 LoopMetadata.push_back(MDNode::get( 3922 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})); 3923 } 3924 3925 addLoopMetadata(Loop, LoopMetadata); 3926 return; 3927 } 3928 3929 // Heuristically determine the unroll factor. 3930 if (Factor == 0) 3931 Factor = computeHeuristicUnrollFactor(Loop); 3932 3933 // No change required with unroll factor 1. 3934 if (Factor == 1) { 3935 *UnrolledCLI = Loop; 3936 return; 3937 } 3938 3939 assert(Factor >= 2 && 3940 "unrolling only makes sense with a factor of 2 or larger"); 3941 3942 Type *IndVarTy = Loop->getIndVarType(); 3943 3944 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully 3945 // unroll the inner loop. 3946 Value *FactorVal = 3947 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor, 3948 /*isSigned=*/false)); 3949 std::vector<CanonicalLoopInfo *> LoopNest = 3950 tileLoops(DL, {Loop}, {FactorVal}); 3951 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling"); 3952 *UnrolledCLI = LoopNest[0]; 3953 CanonicalLoopInfo *InnerLoop = LoopNest[1]; 3954 3955 // LoopUnrollPass can only fully unroll loops with constant trip count. 
3956 // Unroll by the unroll factor with a fallback epilog for the remainder 3957 // iterations if necessary. 3958 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get( 3959 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor))); 3960 addLoopMetadata( 3961 InnerLoop, 3962 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")), 3963 MDNode::get( 3964 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})}); 3965 3966 #ifndef NDEBUG 3967 (*UnrolledCLI)->assertOK(); 3968 #endif 3969 } 3970 3971 OpenMPIRBuilder::InsertPointTy 3972 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc, 3973 llvm::Value *BufSize, llvm::Value *CpyBuf, 3974 llvm::Value *CpyFn, llvm::Value *DidIt) { 3975 if (!updateToLocation(Loc)) 3976 return Loc.IP; 3977 3978 uint32_t SrcLocStrSize; 3979 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 3980 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 3981 Value *ThreadId = getOrCreateThreadID(Ident); 3982 3983 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt); 3984 3985 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD}; 3986 3987 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate); 3988 Builder.CreateCall(Fn, Args); 3989 3990 return Builder.saveIP(); 3991 } 3992 3993 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle( 3994 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 3995 FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) { 3996 3997 if (!updateToLocation(Loc)) 3998 return Loc.IP; 3999 4000 // If needed (i.e. not null), initialize `DidIt` with 0 4001 if (DidIt) { 4002 Builder.CreateStore(Builder.getInt32(0), DidIt); 4003 } 4004 4005 Directive OMPD = Directive::OMPD_single; 4006 uint32_t SrcLocStrSize; 4007 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4008 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4009 Value *ThreadId = getOrCreateThreadID(Ident); 4010 Value *Args[] = {Ident, ThreadId}; 4011 4012 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single); 4013 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args); 4014 4015 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single); 4016 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args); 4017 4018 // generates the following: 4019 // if (__kmpc_single()) { 4020 // .... single region ... 
4021 // __kmpc_end_single 4022 // } 4023 // __kmpc_barrier 4024 4025 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 4026 /*Conditional*/ true, 4027 /*hasFinalize*/ true); 4028 if (!IsNowait) 4029 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), 4030 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false, 4031 /* CheckCancelFlag */ false); 4032 return Builder.saveIP(); 4033 } 4034 4035 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical( 4036 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 4037 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) { 4038 4039 if (!updateToLocation(Loc)) 4040 return Loc.IP; 4041 4042 Directive OMPD = Directive::OMPD_critical; 4043 uint32_t SrcLocStrSize; 4044 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4045 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4046 Value *ThreadId = getOrCreateThreadID(Ident); 4047 Value *LockVar = getOMPCriticalRegionLock(CriticalName); 4048 Value *Args[] = {Ident, ThreadId, LockVar}; 4049 4050 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args)); 4051 Function *RTFn = nullptr; 4052 if (HintInst) { 4053 // Add Hint to entry Args and create call 4054 EnterArgs.push_back(HintInst); 4055 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint); 4056 } else { 4057 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical); 4058 } 4059 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs); 4060 4061 Function *ExitRTLFn = 4062 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical); 4063 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args); 4064 4065 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 4066 /*Conditional*/ false, /*hasFinalize*/ true); 4067 } 4068 4069 OpenMPIRBuilder::InsertPointTy 4070 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc, 4071 InsertPointTy AllocaIP, unsigned NumLoops, 4072 ArrayRef<llvm::Value *> StoreValues, 4073 const Twine &Name, bool IsDependSource) { 4074 assert( 4075 llvm::all_of(StoreValues, 4076 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) && 4077 "OpenMP runtime requires depend vec with i64 type"); 4078 4079 if (!updateToLocation(Loc)) 4080 return Loc.IP; 4081 4082 // Allocate space for vector and generate alloc instruction. 4083 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops); 4084 Builder.restoreIP(AllocaIP); 4085 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name); 4086 ArgsBase->setAlignment(Align(8)); 4087 Builder.restoreIP(Loc.IP); 4088 4089 // Store the index value with offset in depend vector. 
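  // For example, with `ordered(2)` and the current iteration (i, j), a
  // `depend(source)` construct stores {i, j} and posts it via
  // __kmpc_doacross_post, while a `depend(sink: i-1, j)` construct stores
  // {i-1, j} and waits on it via __kmpc_doacross_wait.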
4090 for (unsigned I = 0; I < NumLoops; ++I) { 4091 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP( 4092 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)}); 4093 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter); 4094 STInst->setAlignment(Align(8)); 4095 } 4096 4097 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP( 4098 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)}); 4099 4100 uint32_t SrcLocStrSize; 4101 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4102 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4103 Value *ThreadId = getOrCreateThreadID(Ident); 4104 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP}; 4105 4106 Function *RTLFn = nullptr; 4107 if (IsDependSource) 4108 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post); 4109 else 4110 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait); 4111 Builder.CreateCall(RTLFn, Args); 4112 4113 return Builder.saveIP(); 4114 } 4115 4116 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd( 4117 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, 4118 FinalizeCallbackTy FiniCB, bool IsThreads) { 4119 if (!updateToLocation(Loc)) 4120 return Loc.IP; 4121 4122 Directive OMPD = Directive::OMPD_ordered; 4123 Instruction *EntryCall = nullptr; 4124 Instruction *ExitCall = nullptr; 4125 4126 if (IsThreads) { 4127 uint32_t SrcLocStrSize; 4128 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4129 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4130 Value *ThreadId = getOrCreateThreadID(Ident); 4131 Value *Args[] = {Ident, ThreadId}; 4132 4133 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered); 4134 EntryCall = Builder.CreateCall(EntryRTLFn, Args); 4135 4136 Function *ExitRTLFn = 4137 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered); 4138 ExitCall = Builder.CreateCall(ExitRTLFn, Args); 4139 } 4140 4141 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, 4142 /*Conditional*/ false, /*hasFinalize*/ true); 4143 } 4144 4145 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion( 4146 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall, 4147 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional, 4148 bool HasFinalize, bool IsCancellable) { 4149 4150 if (HasFinalize) 4151 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable}); 4152 4153 // Create inlined region's entry and body blocks, in preparation 4154 // for conditional creation 4155 BasicBlock *EntryBB = Builder.GetInsertBlock(); 4156 Instruction *SplitPos = EntryBB->getTerminator(); 4157 if (!isa_and_nonnull<BranchInst>(SplitPos)) 4158 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB); 4159 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end"); 4160 BasicBlock *FiniBB = 4161 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize"); 4162 4163 Builder.SetInsertPoint(EntryBB->getTerminator()); 4164 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional); 4165 4166 // generate body 4167 BodyGenCB(/* AllocaIP */ InsertPointTy(), 4168 /* CodeGenIP */ Builder.saveIP()); 4169 4170 // emit exit call and do any needed finalization. 
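  // For a conditional region, the generated structure is conceptually:
  //   if (<entry call result> != 0) {
  //     <body>
  //     <finalization>
  //     <exit call>
  //   }
  // (a sketch; the blocks are stitched together below and in
  // emitCommonDirectiveEntry/emitCommonDirectiveExit).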
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");
  MergeBlockIntoPredecessor(FiniBB);

  // Merge the exit block into its predecessor if possible, drop the temporary
  // terminator we may have created above, and reset the builder's insertion
  // point to the resulting block.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  bool Merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  BasicBlock *InsertBB = Merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // If there is nothing to do, return the current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit ThenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move the entry branch to the end of ThenBB and replace it with a
  // conditional branch (the if statement).
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // Return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call.
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    Fi.FiniCB(FinIP);

    BasicBlock *FiniBB = FinIP.getBlock();
    Instruction *FiniBBTI = FiniBB->getTerminator();

    // Set the Builder's IP for call creation.
    Builder.SetInsertPoint(FiniBBTI);
  }

  if (!ExitCall)
    return Builder.saveIP();

  // Place the exit call as the last instruction before the finalization
  // block's terminator.
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  if (!IP.isSet())
    return IP;

  IRBuilder<>::InsertPointGuard IPG(Builder);

  // Creates the following CFG structure:
  //    OMP_Entry : (MasterAddr != PrivateAddr)?
  //          F     T
  //          |      \
  //          |       copyin.not.master
  //          |      /
  //          v     /
  //   copyin.not.master.end
  //          |
  //          v
  //   OMP.Entry.Next

  BasicBlock *OMP_Entry = IP.getBlock();
  Function *CurFn = OMP_Entry->getParent();
  BasicBlock *CopyBegin =
      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
  BasicBlock *CopyEnd = nullptr;

  // If the entry block is terminated, split it to preserve the branch to the
  // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything
  // as is.
4286 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) { 4287 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(), 4288 "copyin.not.master.end"); 4289 OMP_Entry->getTerminator()->eraseFromParent(); 4290 } else { 4291 CopyEnd = 4292 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn); 4293 } 4294 4295 Builder.SetInsertPoint(OMP_Entry); 4296 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy); 4297 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy); 4298 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr); 4299 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd); 4300 4301 Builder.SetInsertPoint(CopyBegin); 4302 if (BranchtoEnd) 4303 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd)); 4304 4305 return Builder.saveIP(); 4306 } 4307 4308 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc, 4309 Value *Size, Value *Allocator, 4310 std::string Name) { 4311 IRBuilder<>::InsertPointGuard IPG(Builder); 4312 Builder.restoreIP(Loc.IP); 4313 4314 uint32_t SrcLocStrSize; 4315 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4316 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4317 Value *ThreadId = getOrCreateThreadID(Ident); 4318 Value *Args[] = {ThreadId, Size, Allocator}; 4319 4320 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc); 4321 4322 return Builder.CreateCall(Fn, Args, Name); 4323 } 4324 4325 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc, 4326 Value *Addr, Value *Allocator, 4327 std::string Name) { 4328 IRBuilder<>::InsertPointGuard IPG(Builder); 4329 Builder.restoreIP(Loc.IP); 4330 4331 uint32_t SrcLocStrSize; 4332 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4333 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4334 Value *ThreadId = getOrCreateThreadID(Ident); 4335 Value *Args[] = {ThreadId, Addr, Allocator}; 4336 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free); 4337 return Builder.CreateCall(Fn, Args, Name); 4338 } 4339 4340 CallInst *OpenMPIRBuilder::createOMPInteropInit( 4341 const LocationDescription &Loc, Value *InteropVar, 4342 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, 4343 Value *DependenceAddress, bool HaveNowaitClause) { 4344 IRBuilder<>::InsertPointGuard IPG(Builder); 4345 Builder.restoreIP(Loc.IP); 4346 4347 uint32_t SrcLocStrSize; 4348 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4349 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4350 Value *ThreadId = getOrCreateThreadID(Ident); 4351 if (Device == nullptr) 4352 Device = ConstantInt::get(Int32, -1); 4353 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType); 4354 if (NumDependences == nullptr) { 4355 NumDependences = ConstantInt::get(Int32, 0); 4356 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); 4357 DependenceAddress = ConstantPointerNull::get(PointerTypeVar); 4358 } 4359 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); 4360 Value *Args[] = { 4361 Ident, ThreadId, InteropVar, InteropTypeVal, 4362 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal}; 4363 4364 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init); 4365 4366 return Builder.CreateCall(Fn, Args); 4367 } 4368 4369 CallInst *OpenMPIRBuilder::createOMPInteropDestroy( 4370 const LocationDescription &Loc, Value *InteropVar, Value *Device, 4371 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) { 4372 
IRBuilder<>::InsertPointGuard IPG(Builder); 4373 Builder.restoreIP(Loc.IP); 4374 4375 uint32_t SrcLocStrSize; 4376 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4377 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4378 Value *ThreadId = getOrCreateThreadID(Ident); 4379 if (Device == nullptr) 4380 Device = ConstantInt::get(Int32, -1); 4381 if (NumDependences == nullptr) { 4382 NumDependences = ConstantInt::get(Int32, 0); 4383 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); 4384 DependenceAddress = ConstantPointerNull::get(PointerTypeVar); 4385 } 4386 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); 4387 Value *Args[] = { 4388 Ident, ThreadId, InteropVar, Device, 4389 NumDependences, DependenceAddress, HaveNowaitClauseVal}; 4390 4391 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy); 4392 4393 return Builder.CreateCall(Fn, Args); 4394 } 4395 4396 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc, 4397 Value *InteropVar, Value *Device, 4398 Value *NumDependences, 4399 Value *DependenceAddress, 4400 bool HaveNowaitClause) { 4401 IRBuilder<>::InsertPointGuard IPG(Builder); 4402 Builder.restoreIP(Loc.IP); 4403 uint32_t SrcLocStrSize; 4404 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4405 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4406 Value *ThreadId = getOrCreateThreadID(Ident); 4407 if (Device == nullptr) 4408 Device = ConstantInt::get(Int32, -1); 4409 if (NumDependences == nullptr) { 4410 NumDependences = ConstantInt::get(Int32, 0); 4411 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext()); 4412 DependenceAddress = ConstantPointerNull::get(PointerTypeVar); 4413 } 4414 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause); 4415 Value *Args[] = { 4416 Ident, ThreadId, InteropVar, Device, 4417 NumDependences, DependenceAddress, HaveNowaitClauseVal}; 4418 4419 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use); 4420 4421 return Builder.CreateCall(Fn, Args); 4422 } 4423 4424 CallInst *OpenMPIRBuilder::createCachedThreadPrivate( 4425 const LocationDescription &Loc, llvm::Value *Pointer, 4426 llvm::ConstantInt *Size, const llvm::Twine &Name) { 4427 IRBuilder<>::InsertPointGuard IPG(Builder); 4428 Builder.restoreIP(Loc.IP); 4429 4430 uint32_t SrcLocStrSize; 4431 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4432 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4433 Value *ThreadId = getOrCreateThreadID(Ident); 4434 Constant *ThreadPrivateCache = 4435 getOrCreateInternalVariable(Int8PtrPtr, Name.str()); 4436 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache}; 4437 4438 Function *Fn = 4439 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached); 4440 4441 return Builder.CreateCall(Fn, Args); 4442 } 4443 4444 OpenMPIRBuilder::InsertPointTy 4445 OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD, 4446 int32_t MinThreadsVal, int32_t MaxThreadsVal, 4447 int32_t MinTeamsVal, int32_t MaxTeamsVal) { 4448 if (!updateToLocation(Loc)) 4449 return Loc.IP; 4450 4451 uint32_t SrcLocStrSize; 4452 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4453 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4454 Constant *IsSPMDVal = ConstantInt::getSigned( 4455 Int8, IsSPMD ? 
OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); 4456 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD); 4457 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true); 4458 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0); 4459 4460 Function *Kernel = Builder.GetInsertBlock()->getParent(); 4461 4462 // Manifest the launch configuration in the metadata matching the kernel 4463 // environment. 4464 if (MinTeamsVal > 1 || MaxTeamsVal > 0) 4465 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal); 4466 4467 // For max values, < 0 means unset, == 0 means set but unknown. 4468 if (MaxThreadsVal < 0) 4469 MaxThreadsVal = std::max( 4470 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal); 4471 4472 if (MaxThreadsVal > 0) 4473 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal); 4474 4475 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal); 4476 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal); 4477 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal); 4478 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal); 4479 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0); 4480 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0); 4481 4482 // We need to strip the debug prefix to get the correct kernel name. 4483 StringRef KernelName = Kernel->getName(); 4484 const std::string DebugPrefix = "_debug__"; 4485 if (KernelName.ends_with(DebugPrefix)) 4486 KernelName = KernelName.drop_back(DebugPrefix.length()); 4487 4488 Function *Fn = getOrCreateRuntimeFunctionPtr( 4489 omp::RuntimeFunction::OMPRTL___kmpc_target_init); 4490 const DataLayout &DL = Fn->getParent()->getDataLayout(); 4491 4492 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment"; 4493 Constant *DynamicEnvironmentInitializer = 4494 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal}); 4495 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable( 4496 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage, 4497 DynamicEnvironmentInitializer, DynamicEnvironmentName, 4498 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, 4499 DL.getDefaultGlobalsAddressSpace()); 4500 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility); 4501 4502 Constant *DynamicEnvironment = 4503 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr 4504 ? 
DynamicEnvironmentGV 4505 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV, 4506 DynamicEnvironmentPtr); 4507 4508 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get( 4509 ConfigurationEnvironment, { 4510 UseGenericStateMachineVal, 4511 MayUseNestedParallelismVal, 4512 IsSPMDVal, 4513 MinThreads, 4514 MaxThreads, 4515 MinTeams, 4516 MaxTeams, 4517 ReductionDataSize, 4518 ReductionBufferLength, 4519 }); 4520 Constant *KernelEnvironmentInitializer = ConstantStruct::get( 4521 KernelEnvironment, { 4522 ConfigurationEnvironmentInitializer, 4523 Ident, 4524 DynamicEnvironment, 4525 }); 4526 Twine KernelEnvironmentName = KernelName + "_kernel_environment"; 4527 GlobalVariable *KernelEnvironmentGV = new GlobalVariable( 4528 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage, 4529 KernelEnvironmentInitializer, KernelEnvironmentName, 4530 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, 4531 DL.getDefaultGlobalsAddressSpace()); 4532 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility); 4533 4534 Constant *KernelEnvironment = 4535 KernelEnvironmentGV->getType() == KernelEnvironmentPtr 4536 ? KernelEnvironmentGV 4537 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV, 4538 KernelEnvironmentPtr); 4539 Value *KernelLaunchEnvironment = Kernel->getArg(0); 4540 CallInst *ThreadKind = 4541 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment}); 4542 4543 Value *ExecUserCode = Builder.CreateICmpEQ( 4544 ThreadKind, ConstantInt::get(ThreadKind->getType(), -1), 4545 "exec_user_code"); 4546 4547 // ThreadKind = __kmpc_target_init(...) 4548 // if (ThreadKind == -1) 4549 // user_code 4550 // else 4551 // return; 4552 4553 auto *UI = Builder.CreateUnreachable(); 4554 BasicBlock *CheckBB = UI->getParent(); 4555 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry"); 4556 4557 BasicBlock *WorkerExitBB = BasicBlock::Create( 4558 CheckBB->getContext(), "worker.exit", CheckBB->getParent()); 4559 Builder.SetInsertPoint(WorkerExitBB); 4560 Builder.CreateRetVoid(); 4561 4562 auto *CheckBBTI = CheckBB->getTerminator(); 4563 Builder.SetInsertPoint(CheckBBTI); 4564 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB); 4565 4566 CheckBBTI->eraseFromParent(); 4567 UI->eraseFromParent(); 4568 4569 // Continue in the "user_code" block, see diagram above and in 4570 // openmp/libomptarget/deviceRTLs/common/include/target.h . 4571 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt()); 4572 } 4573 4574 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc, 4575 int32_t TeamsReductionDataSize, 4576 int32_t TeamsReductionBufferLength) { 4577 if (!updateToLocation(Loc)) 4578 return; 4579 4580 Function *Fn = getOrCreateRuntimeFunctionPtr( 4581 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit); 4582 4583 Builder.CreateCall(Fn, {}); 4584 4585 if (!TeamsReductionBufferLength || !TeamsReductionDataSize) 4586 return; 4587 4588 Function *Kernel = Builder.GetInsertBlock()->getParent(); 4589 // We need to strip the debug prefix to get the correct kernel name. 
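  // (For instance, a kernel emitted as "foo_debug__" is looked up as "foo" so
  // that the "_kernel_environment" global created in createTargetInit is
  // found; the name is illustrative.)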
4590 StringRef KernelName = Kernel->getName(); 4591 const std::string DebugPrefix = "_debug__"; 4592 if (KernelName.ends_with(DebugPrefix)) 4593 KernelName = KernelName.drop_back(DebugPrefix.length()); 4594 auto *KernelEnvironmentGV = 4595 M.getNamedGlobal((KernelName + "_kernel_environment").str()); 4596 assert(KernelEnvironmentGV && "Expected kernel environment global\n"); 4597 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer(); 4598 auto *NewInitializer = ConstantFoldInsertValueInstruction( 4599 KernelEnvironmentInitializer, 4600 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7}); 4601 NewInitializer = ConstantFoldInsertValueInstruction( 4602 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength), 4603 {0, 8}); 4604 KernelEnvironmentGV->setInitializer(NewInitializer); 4605 } 4606 4607 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) { 4608 Module &M = *Kernel.getParent(); 4609 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); 4610 for (auto *Op : MD->operands()) { 4611 if (Op->getNumOperands() != 3) 4612 continue; 4613 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0)); 4614 if (!KernelOp || KernelOp->getValue() != &Kernel) 4615 continue; 4616 auto *Prop = dyn_cast<MDString>(Op->getOperand(1)); 4617 if (!Prop || Prop->getString() != Name) 4618 continue; 4619 return Op; 4620 } 4621 return nullptr; 4622 } 4623 4624 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, 4625 bool Min) { 4626 // Update the "maxntidx" metadata for NVIDIA, or add it. 4627 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name); 4628 if (ExistingOp) { 4629 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); 4630 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); 4631 ExistingOp->replaceOperandWith( 4632 2, ConstantAsMetadata::get(ConstantInt::get( 4633 OldVal->getValue()->getType(), 4634 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value)))); 4635 } else { 4636 LLVMContext &Ctx = Kernel.getContext(); 4637 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel), 4638 MDString::get(Ctx, Name), 4639 ConstantAsMetadata::get( 4640 ConstantInt::get(Type::getInt32Ty(Ctx), Value))}; 4641 // Append metadata to nvvm.annotations 4642 Module &M = *Kernel.getParent(); 4643 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); 4644 MD->addOperand(MDNode::get(Ctx, MDVals)); 4645 } 4646 } 4647 4648 std::pair<int32_t, int32_t> 4649 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) { 4650 int32_t ThreadLimit = 4651 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit"); 4652 4653 if (T.isAMDGPU()) { 4654 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size"); 4655 if (!Attr.isValid() || !Attr.isStringAttribute()) 4656 return {0, ThreadLimit}; 4657 auto [LBStr, UBStr] = Attr.getValueAsString().split(','); 4658 int32_t LB, UB; 4659 if (!llvm::to_integer(UBStr, UB, 10)) 4660 return {0, ThreadLimit}; 4661 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB; 4662 if (!llvm::to_integer(LBStr, LB, 10)) 4663 return {0, UB}; 4664 return {LB, UB}; 4665 } 4666 4667 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) { 4668 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2)); 4669 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue(); 4670 return {0, ThreadLimit ? 
std::min(ThreadLimit, UB) : UB}; 4671 } 4672 return {0, ThreadLimit}; 4673 } 4674 4675 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T, 4676 Function &Kernel, int32_t LB, 4677 int32_t UB) { 4678 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB)); 4679 4680 if (T.isAMDGPU()) { 4681 Kernel.addFnAttr("amdgpu-flat-work-group-size", 4682 llvm::utostr(LB) + "," + llvm::utostr(UB)); 4683 return; 4684 } 4685 4686 updateNVPTXMetadata(Kernel, "maxntidx", UB, true); 4687 } 4688 4689 std::pair<int32_t, int32_t> 4690 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) { 4691 // TODO: Read from backend annotations if available. 4692 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")}; 4693 } 4694 4695 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel, 4696 int32_t LB, int32_t UB) { 4697 if (T.isNVPTX()) { 4698 if (UB > 0) 4699 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true); 4700 updateNVPTXMetadata(Kernel, "minctasm", LB, false); 4701 } 4702 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB)); 4703 } 4704 4705 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( 4706 Function *OutlinedFn) { 4707 if (Config.isTargetDevice()) { 4708 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage); 4709 // TODO: Determine if DSO local can be set to true. 4710 OutlinedFn->setDSOLocal(false); 4711 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility); 4712 if (T.isAMDGCN()) 4713 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL); 4714 } 4715 } 4716 4717 Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn, 4718 StringRef EntryFnIDName) { 4719 if (Config.isTargetDevice()) { 4720 assert(OutlinedFn && "The outlined function must exist if embedded"); 4721 return OutlinedFn; 4722 } 4723 4724 return new GlobalVariable( 4725 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage, 4726 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName); 4727 } 4728 4729 Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn, 4730 StringRef EntryFnName) { 4731 if (OutlinedFn) 4732 return OutlinedFn; 4733 4734 assert(!M.getGlobalVariable(EntryFnName, true) && 4735 "Named kernel already exists?"); 4736 return new GlobalVariable( 4737 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage, 4738 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName); 4739 } 4740 4741 void OpenMPIRBuilder::emitTargetRegionFunction( 4742 TargetRegionEntryInfo &EntryInfo, 4743 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, 4744 Function *&OutlinedFn, Constant *&OutlinedFnID) { 4745 4746 SmallString<64> EntryFnName; 4747 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo); 4748 4749 OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory() 4750 ? GenerateFunctionCallback(EntryFnName) 4751 : nullptr; 4752 4753 // If this target outline function is not an offload entry, we don't need to 4754 // register it. This may be in the case of a false if clause, or if there are 4755 // no OpenMP targets. 4756 if (!IsOffloadEntry) 4757 return; 4758 4759 std::string EntryFnIDName = 4760 Config.isTargetDevice() 4761 ? 
std::string(EntryFnName) 4762 : createPlatformSpecificName({EntryFnName, "region_id"}); 4763 4764 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn, 4765 EntryFnName, EntryFnIDName); 4766 } 4767 4768 Constant *OpenMPIRBuilder::registerTargetRegionFunction( 4769 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn, 4770 StringRef EntryFnName, StringRef EntryFnIDName) { 4771 if (OutlinedFn) 4772 setOutlinedTargetRegionFunctionAttributes(OutlinedFn); 4773 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName); 4774 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName); 4775 OffloadInfoManager.registerTargetRegionEntryInfo( 4776 EntryInfo, EntryAddr, OutlinedFnID, 4777 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion); 4778 return OutlinedFnID; 4779 } 4780 4781 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData( 4782 const LocationDescription &Loc, InsertPointTy AllocaIP, 4783 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, 4784 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, 4785 omp::RuntimeFunction *MapperFunc, 4786 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> 4787 BodyGenCB, 4788 function_ref<void(unsigned int, Value *)> DeviceAddrCB, 4789 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) { 4790 if (!updateToLocation(Loc)) 4791 return InsertPointTy(); 4792 4793 // Disable TargetData CodeGen on Device pass. 4794 if (Config.IsTargetDevice.value_or(false)) 4795 return Builder.saveIP(); 4796 4797 Builder.restoreIP(CodeGenIP); 4798 bool IsStandAlone = !BodyGenCB; 4799 MapInfosTy *MapInfo; 4800 // Generate the code for the opening of the data environment. Capture all the 4801 // arguments of the runtime call by reference because they are used in the 4802 // closing of the region. 4803 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { 4804 MapInfo = &GenMapInfoCB(Builder.saveIP()); 4805 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info, 4806 /*IsNonContiguous=*/true, DeviceAddrCB, 4807 CustomMapperCB); 4808 4809 TargetDataRTArgs RTArgs; 4810 emitOffloadingArraysArgument(Builder, RTArgs, Info, 4811 !MapInfo->Names.empty()); 4812 4813 // Emit the number of elements in the offloading arrays. 
4814 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs); 4815 4816 // Source location for the ident struct 4817 if (!SrcLocInfo) { 4818 uint32_t SrcLocStrSize; 4819 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4820 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4821 } 4822 4823 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID, 4824 PointerNum, RTArgs.BasePointersArray, 4825 RTArgs.PointersArray, RTArgs.SizesArray, 4826 RTArgs.MapTypesArray, RTArgs.MapNamesArray, 4827 RTArgs.MappersArray}; 4828 4829 if (IsStandAlone) { 4830 assert(MapperFunc && "MapperFunc missing for standalone target data"); 4831 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc), 4832 OffloadingArgs); 4833 } else { 4834 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr( 4835 omp::OMPRTL___tgt_target_data_begin_mapper); 4836 4837 Builder.CreateCall(BeginMapperFunc, OffloadingArgs); 4838 4839 for (auto DeviceMap : Info.DevicePtrInfoMap) { 4840 if (isa<AllocaInst>(DeviceMap.second.second)) { 4841 auto *LI = 4842 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first); 4843 Builder.CreateStore(LI, DeviceMap.second.second); 4844 } 4845 } 4846 4847 // If device pointer privatization is required, emit the body of the 4848 // region here. It will have to be duplicated: with and without 4849 // privatization. 4850 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv)); 4851 } 4852 }; 4853 4854 // If we need device pointer privatization, we need to emit the body of the 4855 // region with no privatization in the 'else' branch of the conditional. 4856 // Otherwise, we don't have to do anything. 4857 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { 4858 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv)); 4859 }; 4860 4861 // Generate code for the closing of the data region. 4862 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { 4863 TargetDataRTArgs RTArgs; 4864 emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(), 4865 /*ForEndCall=*/true); 4866 4867 // Emit the number of elements in the offloading arrays. 4868 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs); 4869 4870 // Source location for the ident struct 4871 if (!SrcLocInfo) { 4872 uint32_t SrcLocStrSize; 4873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); 4874 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize); 4875 } 4876 4877 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID, 4878 PointerNum, RTArgs.BasePointersArray, 4879 RTArgs.PointersArray, RTArgs.SizesArray, 4880 RTArgs.MapTypesArray, RTArgs.MapNamesArray, 4881 RTArgs.MappersArray}; 4882 Function *EndMapperFunc = 4883 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper); 4884 4885 Builder.CreateCall(EndMapperFunc, OffloadingArgs); 4886 }; 4887 4888 // We don't have to do anything to close the region if the if clause evaluates 4889 // to false. 4890 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {}; 4891 4892 if (BodyGenCB) { 4893 if (IfCond) { 4894 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP); 4895 } else { 4896 BeginThenGen(AllocaIP, Builder.saveIP()); 4897 } 4898 4899 // If we don't require privatization of device pointers, we emit the body in 4900 // between the runtime calls. This avoids duplicating the body code. 
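  // The overall shape emitted in that common case is, as a sketch:
  //   call void @__tgt_target_data_begin_mapper(...)
  //   <body, BodyGenTy::NoPriv>
  //   call void @__tgt_target_data_end_mapper(...)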
4901 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv)); 4902 4903 if (IfCond) { 4904 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP); 4905 } else { 4906 EndThenGen(AllocaIP, Builder.saveIP()); 4907 } 4908 } else { 4909 if (IfCond) { 4910 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP); 4911 } else { 4912 BeginThenGen(AllocaIP, Builder.saveIP()); 4913 } 4914 } 4915 4916 return Builder.saveIP(); 4917 } 4918 4919 FunctionCallee 4920 OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned, 4921 bool IsGPUDistribute) { 4922 assert((IVSize == 32 || IVSize == 64) && 4923 "IV size is not compatible with the omp runtime"); 4924 RuntimeFunction Name; 4925 if (IsGPUDistribute) 4926 Name = IVSize == 32 4927 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4 4928 : omp::OMPRTL___kmpc_distribute_static_init_4u) 4929 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8 4930 : omp::OMPRTL___kmpc_distribute_static_init_8u); 4931 else 4932 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4 4933 : omp::OMPRTL___kmpc_for_static_init_4u) 4934 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8 4935 : omp::OMPRTL___kmpc_for_static_init_8u); 4936 4937 return getOrCreateRuntimeFunction(M, Name); 4938 } 4939 4940 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize, 4941 bool IVSigned) { 4942 assert((IVSize == 32 || IVSize == 64) && 4943 "IV size is not compatible with the omp runtime"); 4944 RuntimeFunction Name = IVSize == 32 4945 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4 4946 : omp::OMPRTL___kmpc_dispatch_init_4u) 4947 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8 4948 : omp::OMPRTL___kmpc_dispatch_init_8u); 4949 4950 return getOrCreateRuntimeFunction(M, Name); 4951 } 4952 4953 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize, 4954 bool IVSigned) { 4955 assert((IVSize == 32 || IVSize == 64) && 4956 "IV size is not compatible with the omp runtime"); 4957 RuntimeFunction Name = IVSize == 32 4958 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4 4959 : omp::OMPRTL___kmpc_dispatch_next_4u) 4960 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8 4961 : omp::OMPRTL___kmpc_dispatch_next_8u); 4962 4963 return getOrCreateRuntimeFunction(M, Name); 4964 } 4965 4966 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize, 4967 bool IVSigned) { 4968 assert((IVSize == 32 || IVSize == 64) && 4969 "IV size is not compatible with the omp runtime"); 4970 RuntimeFunction Name = IVSize == 32 4971 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4 4972 : omp::OMPRTL___kmpc_dispatch_fini_4u) 4973 : (IVSigned ? 
omp::OMPRTL___kmpc_dispatch_fini_8
                                  : omp::OMPRTL___kmpc_dispatch_fini_8u);

  return getOrCreateRuntimeFunction(M, Name);
}

static void replaceConstantExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
                                                   Function *Func) {
  for (User *User : make_early_inc_range(ConstExpr->users()))
    if (auto *Instr = dyn_cast<Instruction>(User))
      if (Instr->getFunction() == Func)
        Instr->replaceUsesOfWith(ConstExpr, ConstExpr->getAsInstruction(Instr));
}

static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
                                                    Function *Func) {
  for (User *User : make_early_inc_range(Input->users()))
    if (auto *Const = dyn_cast<Constant>(User))
      if (auto *ConstExpr = dyn_cast<ConstantExpr>(Const))
        replaceConstantExprUsesInFuncWithInstr(ConstExpr, Func);
}

static Function *createOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
    SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
  SmallVector<Type *> ParameterTypes;
  if (OMPBuilder.Config.isTargetDevice()) {
    // Add the "implicit" runtime argument we use to provide launch specific
    // information for target devices.
    auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
    ParameterTypes.push_back(Int8PtrTy);

    // All parameters to target devices are passed as pointers
    // or i64. This assumes 64-bit address spaces/pointers.
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType()->isPointerTy()
                                   ? Arg->getType()
                                   : Type::getInt64Ty(Builder.getContext()));
  } else {
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType());
  }

  auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
                                    /*isVarArg*/ false);
  auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
                               Builder.GetInsertBlock()->getModule());

  // Save insert point.
  auto OldInsertPoint = Builder.saveIP();

  // Generate the region into the function.
  BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
  Builder.SetInsertPoint(EntryBB);

  // Insert target init call in the device compilation pass.
  if (OMPBuilder.Config.isTargetDevice())
    Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));

  BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();

  // Generate the region's body, then insert the target deinit call in the
  // device compilation pass.
  Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.createTargetDeinit(Builder);

  // Insert return instruction.
  Builder.CreateRetVoid();

  // New alloca IP at the entry point of the created device function.
  Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
  auto AllocaIP = Builder.saveIP();

  Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());

  // Skip the artificial dyn_ptr on the device.
  const auto &ArgRange =
      OMPBuilder.Config.isTargetDevice()
          ? make_range(Func->arg_begin() + 1, Func->arg_end())
          : Func->args();

  // Rewrite uses of input values to parameters.
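  // For example (illustrative): if a host value %x is passed into the region,
  // every use of %x inside the outlined body is redirected to the InputCopy
  // value that ArgAccessorFuncCB derives from the matching kernel parameter.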
  for (auto InArg : zip(Inputs, ArgRange)) {
    Value *Input = std::get<0>(InArg);
    Argument &Arg = std::get<1>(InArg);
    Value *InputCopy = nullptr;

    Builder.restoreIP(
        ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));

    // Uses of the input may be wrapped in ConstantExprs (e.g. GEPs). A
    // Constant does not know what it is contained in, so to decide whether a
    // use is inside the function being outlined we must dig through to the
    // enclosing instructions. Each such ConstantExpr used in this function is
    // therefore rematerialized as an equivalent instruction: the new
    // instruction is owned by the outlined function, so the loop below can
    // rewrite it via replaceUsesOfWith (which cannot be invoked on a
    // Constant), while the original expression stays intact for any uses
    // outside the function.
    replaceConstantValueUsesInFuncWithInstr(Input, Func);

    // Rewrite all remaining instruction uses of the input inside the outlined
    // function to the copy produced by the argument accessor.
    for (User *User : make_early_inc_range(Input->users()))
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (Instr->getFunction() == Func)
          Instr->replaceUsesOfWith(Input, InputCopy);
  }

  // Restore insert point.
  Builder.restoreIP(OldInsertPoint);

  return Func;
}

static void emitTargetOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
    TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
    Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {

  OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
      [&OMPBuilder, &Builder, &Inputs, &CBFunc,
       &ArgAccessorFuncCB](StringRef EntryFnName) {
        return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
                                      CBFunc, ArgAccessorFuncCB);
      };

  OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                      OutlinedFn, OutlinedFnID);
}

static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
                           OpenMPIRBuilder::InsertPointTy AllocaIP,
                           Function *OutlinedFn, Constant *OutlinedFnID,
                           int32_t NumTeams, int32_t NumThreads,
                           SmallVectorImpl<Value *> &Args,
                           OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {

  OpenMPIRBuilder::TargetDataInfo Info(
      /*RequiresDevicePointerInfo=*/false,
      /*SeparateBeginEndCalls=*/true);

  OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
  OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
                                  /*IsNonContiguous=*/true);

  OpenMPIRBuilder::TargetDataRTArgs RTArgs;
  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                          !MapInfo.Names.empty());

  // Fallback for emitKernelLaunch: if the kernel cannot be launched on the
  // device, call the host version of the outlined function instead.
  auto &&EmitTargetCallFallbackCB =
      [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
    Builder.restoreIP(IP);
    Builder.CreateCall(OutlinedFn, Args);
    return Builder.saveIP();
  };

  unsigned NumTargetItems = MapInfo.BasePointers.size();
  // TODO: Use correct device ID
  Value *DeviceID =
Builder.getInt64(OMP_DEVICEID_UNDEF); 5140 Value *NumTeamsVal = Builder.getInt32(NumTeams); 5141 Value *NumThreadsVal = Builder.getInt32(NumThreads); 5142 uint32_t SrcLocStrSize; 5143 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize); 5144 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize, 5145 llvm::omp::IdentFlag(0), 0); 5146 // TODO: Use correct NumIterations 5147 Value *NumIterations = Builder.getInt64(0); 5148 // TODO: Use correct DynCGGroupMem 5149 Value *DynCGGroupMem = Builder.getInt32(0); 5150 5151 bool HasNoWait = false; 5152 5153 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations, 5154 NumTeamsVal, NumThreadsVal, 5155 DynCGGroupMem, HasNoWait); 5156 5157 Builder.restoreIP(OMPBuilder.emitKernelLaunch( 5158 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, 5159 DeviceID, RTLoc, AllocaIP)); 5160 } 5161 5162 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget( 5163 const LocationDescription &Loc, InsertPointTy AllocaIP, 5164 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, 5165 int32_t NumThreads, SmallVectorImpl<Value *> &Args, 5166 GenMapInfoCallbackTy GenMapInfoCB, 5167 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, 5168 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) { 5169 if (!updateToLocation(Loc)) 5170 return InsertPointTy(); 5171 5172 Builder.restoreIP(CodeGenIP); 5173 5174 Function *OutlinedFn; 5175 Constant *OutlinedFnID; 5176 emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn, 5177 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB); 5178 if (!Config.isTargetDevice()) 5179 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams, 5180 NumThreads, Args, GenMapInfoCB); 5181 5182 return Builder.saveIP(); 5183 } 5184 5185 std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts, 5186 StringRef FirstSeparator, 5187 StringRef Separator) { 5188 SmallString<128> Buffer; 5189 llvm::raw_svector_ostream OS(Buffer); 5190 StringRef Sep = FirstSeparator; 5191 for (StringRef Part : Parts) { 5192 OS << Sep << Part; 5193 Sep = Separator; 5194 } 5195 return OS.str().str(); 5196 } 5197 5198 std::string 5199 OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const { 5200 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(), 5201 Config.separator()); 5202 } 5203 5204 GlobalVariable * 5205 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name, 5206 unsigned AddressSpace) { 5207 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first; 5208 if (Elem.second) { 5209 assert(Elem.second->getValueType() == Ty && 5210 "OMP internal variable has different type than requested"); 5211 } else { 5212 // TODO: investigate the appropriate linkage type used for the global 5213 // variable for possibly changing that to internal or private, or maybe 5214 // create different versions of the function for different OMP internal 5215 // variables. 5216 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0 5217 ? 
GlobalValue::ExternalLinkage 5218 : GlobalValue::CommonLinkage; 5219 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage, 5220 Constant::getNullValue(Ty), Elem.first(), 5221 /*InsertBefore=*/nullptr, 5222 GlobalValue::NotThreadLocal, AddressSpace); 5223 const DataLayout &DL = M.getDataLayout(); 5224 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty); 5225 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace); 5226 GV->setAlignment(std::max(TypeAlign, PtrAlign)); 5227 Elem.second = GV; 5228 } 5229 5230 return Elem.second; 5231 } 5232 5233 Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) { 5234 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str(); 5235 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", "."); 5236 return getOrCreateInternalVariable(KmpCriticalNameTy, Name); 5237 } 5238 5239 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) { 5240 LLVMContext &Ctx = Builder.getContext(); 5241 Value *Null = 5242 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext())); 5243 Value *SizeGep = 5244 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1)); 5245 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx)); 5246 return SizePtrToInt; 5247 } 5248 5249 GlobalVariable * 5250 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings, 5251 std::string VarName) { 5252 llvm::Constant *MaptypesArrayInit = 5253 llvm::ConstantDataArray::get(M.getContext(), Mappings); 5254 auto *MaptypesArrayGlobal = new llvm::GlobalVariable( 5255 M, MaptypesArrayInit->getType(), 5256 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit, 5257 VarName); 5258 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); 5259 return MaptypesArrayGlobal; 5260 } 5261 5262 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc, 5263 InsertPointTy AllocaIP, 5264 unsigned NumOperands, 5265 struct MapperAllocas &MapperAllocas) { 5266 if (!updateToLocation(Loc)) 5267 return; 5268 5269 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands); 5270 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands); 5271 Builder.restoreIP(AllocaIP); 5272 AllocaInst *ArgsBase = Builder.CreateAlloca( 5273 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs"); 5274 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr, 5275 ".offload_ptrs"); 5276 AllocaInst *ArgSizes = Builder.CreateAlloca( 5277 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes"); 5278 Builder.restoreIP(Loc.IP); 5279 MapperAllocas.ArgsBase = ArgsBase; 5280 MapperAllocas.Args = Args; 5281 MapperAllocas.ArgSizes = ArgSizes; 5282 } 5283 5284 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc, 5285 Function *MapperFunc, Value *SrcLocInfo, 5286 Value *MaptypesArg, Value *MapnamesArg, 5287 struct MapperAllocas &MapperAllocas, 5288 int64_t DeviceID, unsigned NumOperands) { 5289 if (!updateToLocation(Loc)) 5290 return; 5291 5292 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands); 5293 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands); 5294 Value *ArgsBaseGEP = 5295 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase, 5296 {Builder.getInt32(0), Builder.getInt32(0)}); 5297 Value *ArgsGEP = 5298 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args, 5299 {Builder.getInt32(0), Builder.getInt32(0)}); 5300 Value *ArgSizesGEP = 5301 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes, 5302 {Builder.getInt32(0), 
Builder.getInt32(0)}); 5303 Value *NullPtr = 5304 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext())); 5305 Builder.CreateCall(MapperFunc, 5306 {SrcLocInfo, Builder.getInt64(DeviceID), 5307 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP, 5308 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr}); 5309 } 5310 5311 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder, 5312 TargetDataRTArgs &RTArgs, 5313 TargetDataInfo &Info, 5314 bool EmitDebug, 5315 bool ForEndCall) { 5316 assert((!ForEndCall || Info.separateBeginEndCalls()) && 5317 "expected region end call to runtime only when end call is separate"); 5318 auto UnqualPtrTy = PointerType::getUnqual(M.getContext()); 5319 auto VoidPtrTy = UnqualPtrTy; 5320 auto VoidPtrPtrTy = UnqualPtrTy; 5321 auto Int64Ty = Type::getInt64Ty(M.getContext()); 5322 auto Int64PtrTy = UnqualPtrTy; 5323 5324 if (!Info.NumberOfPtrs) { 5325 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy); 5326 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy); 5327 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy); 5328 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy); 5329 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy); 5330 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy); 5331 return; 5332 } 5333 5334 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32( 5335 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), 5336 Info.RTArgs.BasePointersArray, 5337 /*Idx0=*/0, /*Idx1=*/0); 5338 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32( 5339 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 5340 /*Idx0=*/0, 5341 /*Idx1=*/0); 5342 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32( 5343 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray, 5344 /*Idx0=*/0, /*Idx1=*/0); 5345 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32( 5346 ArrayType::get(Int64Ty, Info.NumberOfPtrs), 5347 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd 5348 : Info.RTArgs.MapTypesArray, 5349 /*Idx0=*/0, 5350 /*Idx1=*/0); 5351 5352 // Only emit the mapper information arrays if debug information is 5353 // requested. 5354 if (!EmitDebug) 5355 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy); 5356 else 5357 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32( 5358 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray, 5359 /*Idx0=*/0, 5360 /*Idx1=*/0); 5361 // If there is no user-defined mapper, set the mapper array to nullptr to 5362 // avoid an unnecessary data privatization 5363 if (!Info.HasMapper) 5364 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy); 5365 else 5366 RTArgs.MappersArray = 5367 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy); 5368 } 5369 5370 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP, 5371 InsertPointTy CodeGenIP, 5372 MapInfosTy &CombinedInfo, 5373 TargetDataInfo &Info) { 5374 MapInfosTy::StructNonContiguousInfo &NonContigInfo = 5375 CombinedInfo.NonContigInfo; 5376 5377 // Build an array of struct descriptor_dim and then assign it to 5378 // offload_args. 
  //
  // struct descriptor_dim {
  //   uint64_t offset;
  //   uint64_t count;
  //   uint64_t stride;
  // };
  Type *Int64Ty = Builder.getInt64Ty();
  StructType *DimTy = StructType::create(
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variables here since the size of "Dims" is the same as
  // the size of Components; however, the sizes of the offset, count, and
  // stride arrays equal the number of base declarations that are
  // non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting IR if the dimension size is 1 since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    Builder.restoreIP(CodeGenIP);
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          DimsAddr->getAllocatedType(), DimsAddr,
          {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(StrideLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getPtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
    ++L;
  }
}

void OpenMPIRBuilder::emitOffloadingArrays(
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
    TargetDataInfo &Info, bool IsNonContiguous,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB,
    function_ref<Value *(unsigned int)> CustomMapperCB) {

  // Reset the array information.
  Info.clearArrayInfo();
  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();

  if (Info.NumberOfPtrs == 0)
    return;

  Builder.restoreIP(AllocaIP);
  // Detect if we have any capture size requiring runtime evaluation of the
  // size so that a constant array could be eventually used.
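  // (For instance, a VLA-sized mapping such as `map(tofrom: a[0:n])` yields a
  // runtime-computed size and forces a runtime-filled .offload_sizes buffer,
  // whereas all-constant sizes fold into a private constant global; the
  // example is illustrative.)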
5454 ArrayType *PointerArrayType = 5455 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs); 5456 5457 Info.RTArgs.BasePointersArray = Builder.CreateAlloca( 5458 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs"); 5459 5460 Info.RTArgs.PointersArray = Builder.CreateAlloca( 5461 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs"); 5462 AllocaInst *MappersArray = Builder.CreateAlloca( 5463 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers"); 5464 Info.RTArgs.MappersArray = MappersArray; 5465 5466 // If we don't have any VLA types or other types that require runtime 5467 // evaluation, we can use a constant array for the map sizes, otherwise we 5468 // need to fill up the arrays as we do for the pointers. 5469 Type *Int64Ty = Builder.getInt64Ty(); 5470 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(), 5471 ConstantInt::get(Int64Ty, 0)); 5472 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size()); 5473 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) { 5474 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) { 5475 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) { 5476 if (IsNonContiguous && 5477 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 5478 CombinedInfo.Types[I] & 5479 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG)) 5480 ConstSizes[I] = 5481 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]); 5482 else 5483 ConstSizes[I] = CI; 5484 continue; 5485 } 5486 } 5487 RuntimeSizes.set(I); 5488 } 5489 5490 if (RuntimeSizes.all()) { 5491 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs); 5492 Info.RTArgs.SizesArray = Builder.CreateAlloca( 5493 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); 5494 Builder.restoreIP(CodeGenIP); 5495 } else { 5496 auto *SizesArrayInit = ConstantArray::get( 5497 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes); 5498 std::string Name = createPlatformSpecificName({"offload_sizes"}); 5499 auto *SizesArrayGbl = 5500 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true, 5501 GlobalValue::PrivateLinkage, SizesArrayInit, Name); 5502 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); 5503 5504 if (!RuntimeSizes.any()) { 5505 Info.RTArgs.SizesArray = SizesArrayGbl; 5506 } else { 5507 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0); 5508 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64); 5509 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs); 5510 AllocaInst *Buffer = Builder.CreateAlloca( 5511 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes"); 5512 Buffer->setAlignment(OffloadSizeAlign); 5513 Builder.restoreIP(CodeGenIP); 5514 Builder.CreateMemCpy( 5515 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()), 5516 SizesArrayGbl, OffloadSizeAlign, 5517 Builder.getIntN( 5518 IndexSize, 5519 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue())); 5520 5521 Info.RTArgs.SizesArray = Buffer; 5522 } 5523 Builder.restoreIP(CodeGenIP); 5524 } 5525 5526 // The map types are always constant so we don't need to generate code to 5527 // fill arrays. Instead, we create an array constant. 
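  // (As an illustrative example, a plain `map(tofrom: x)` entry typically
  // encodes to OMP_MAP_TO | OMP_MAP_FROM == 0x3 in this array; see
  // OpenMPOffloadMappingFlags for the full encoding.)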
5528 SmallVector<uint64_t, 4> Mapping; 5529 for (auto mapFlag : CombinedInfo.Types) 5530 Mapping.push_back( 5531 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 5532 mapFlag)); 5533 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"}); 5534 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName); 5535 Info.RTArgs.MapTypesArray = MapTypesArrayGbl; 5536 5537 // The information types are only built if provided. 5538 if (!CombinedInfo.Names.empty()) { 5539 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"}); 5540 auto *MapNamesArrayGbl = 5541 createOffloadMapnames(CombinedInfo.Names, MapnamesName); 5542 Info.RTArgs.MapNamesArray = MapNamesArrayGbl; 5543 } else { 5544 Info.RTArgs.MapNamesArray = 5545 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())); 5546 } 5547 5548 // If there's a present map type modifier, it must not be applied to the end 5549 // of a region, so generate a separate map type array in that case. 5550 if (Info.separateBeginEndCalls()) { 5551 bool EndMapTypesDiffer = false; 5552 for (uint64_t &Type : Mapping) { 5553 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 5554 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) { 5555 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>( 5556 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT); 5557 EndMapTypesDiffer = true; 5558 } 5559 } 5560 if (EndMapTypesDiffer) { 5561 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName); 5562 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl; 5563 } 5564 } 5565 5566 PointerType *PtrTy = Builder.getPtrTy(); 5567 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) { 5568 Value *BPVal = CombinedInfo.BasePointers[I]; 5569 Value *BP = Builder.CreateConstInBoundsGEP2_32( 5570 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray, 5571 0, I); 5572 Builder.CreateAlignedStore(BPVal, BP, 5573 M.getDataLayout().getPrefTypeAlign(PtrTy)); 5574 5575 if (Info.requiresDevicePointerInfo()) { 5576 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) { 5577 CodeGenIP = Builder.saveIP(); 5578 Builder.restoreIP(AllocaIP); 5579 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)}; 5580 Builder.restoreIP(CodeGenIP); 5581 if (DeviceAddrCB) 5582 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second); 5583 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) { 5584 Info.DevicePtrInfoMap[BPVal] = {BP, BP}; 5585 if (DeviceAddrCB) 5586 DeviceAddrCB(I, BP); 5587 } 5588 } 5589 5590 Value *PVal = CombinedInfo.Pointers[I]; 5591 Value *P = Builder.CreateConstInBoundsGEP2_32( 5592 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0, 5593 I); 5594 // TODO: Check alignment correct. 5595 Builder.CreateAlignedStore(PVal, P, 5596 M.getDataLayout().getPrefTypeAlign(PtrTy)); 5597 5598 if (RuntimeSizes.test(I)) { 5599 Value *S = Builder.CreateConstInBoundsGEP2_32( 5600 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray, 5601 /*Idx0=*/0, 5602 /*Idx1=*/I); 5603 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I], 5604 Int64Ty, 5605 /*isSigned=*/true), 5606 S, M.getDataLayout().getPrefTypeAlign(PtrTy)); 5607 } 5608 // Fill up the mapper array. 
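  // Each slot holds either a null pointer or the user-defined mapper function
  // that CustomMapperCB returns for that map entry.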
5609 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0); 5610 Value *MFunc = ConstantPointerNull::get(PtrTy); 5611 if (CustomMapperCB) 5612 if (Value *CustomMFunc = CustomMapperCB(I)) 5613 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy); 5614 Value *MAddr = Builder.CreateInBoundsGEP( 5615 MappersArray->getAllocatedType(), MappersArray, 5616 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)}); 5617 Builder.CreateAlignedStore( 5618 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType())); 5619 } 5620 5621 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() || 5622 Info.NumberOfPtrs == 0) 5623 return; 5624 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info); 5625 } 5626 5627 void OpenMPIRBuilder::emitBranch(BasicBlock *Target) { 5628 BasicBlock *CurBB = Builder.GetInsertBlock(); 5629 5630 if (!CurBB || CurBB->getTerminator()) { 5631 // If there is no insert point or the previous block is already 5632 // terminated, don't touch it. 5633 } else { 5634 // Otherwise, create a fall-through branch. 5635 Builder.CreateBr(Target); 5636 } 5637 5638 Builder.ClearInsertionPoint(); 5639 } 5640 5641 void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn, 5642 bool IsFinished) { 5643 BasicBlock *CurBB = Builder.GetInsertBlock(); 5644 5645 // Fall out of the current block (if necessary). 5646 emitBranch(BB); 5647 5648 if (IsFinished && BB->use_empty()) { 5649 BB->eraseFromParent(); 5650 return; 5651 } 5652 5653 // Place the block after the current block, if possible, or else at 5654 // the end of the function. 5655 if (CurBB && CurBB->getParent()) 5656 CurFn->insert(std::next(CurBB->getIterator()), BB); 5657 else 5658 CurFn->insert(CurFn->end(), BB); 5659 Builder.SetInsertPoint(BB); 5660 } 5661 5662 void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, 5663 BodyGenCallbackTy ElseGen, 5664 InsertPointTy AllocaIP) { 5665 // If the condition constant folds and can be elided, try to avoid emitting 5666 // the condition and the dead arm of the if/else. 5667 if (auto *CI = dyn_cast<ConstantInt>(Cond)) { 5668 auto CondConstant = CI->getSExtValue(); 5669 if (CondConstant) 5670 ThenGen(AllocaIP, Builder.saveIP()); 5671 else 5672 ElseGen(AllocaIP, Builder.saveIP()); 5673 return; 5674 } 5675 5676 Function *CurFn = Builder.GetInsertBlock()->getParent(); 5677 5678 // Otherwise, the condition did not fold, or we couldn't elide it. Just 5679 // emit the conditional branch. 5680 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then"); 5681 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else"); 5682 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end"); 5683 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock); 5684 // Emit the 'then' code. 5685 emitBlock(ThenBlock, CurFn); 5686 ThenGen(AllocaIP, Builder.saveIP()); 5687 emitBranch(ContBlock); 5688 // Emit the 'else' code if present. 5689 // There is no need to emit line number for unconditional branch. 5690 emitBlock(ElseBlock, CurFn); 5691 ElseGen(AllocaIP, Builder.saveIP()); 5692 // There is no need to emit line number for unconditional branch. 5693 emitBranch(ContBlock); 5694 // Emit the continuation block for code after the if. 
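  // The resulting CFG is, as a sketch:
  //          br i1 %cond, label %omp_if.then, label %omp_if.else
  //   omp_if.then:  <ThenGen> ; br label %omp_if.end
  //   omp_if.else:  <ElseGen> ; br label %omp_if.end
  //   omp_if.end: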
  emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
}

bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
  assert(!(AO == AtomicOrdering::NotAtomic ||
           AO == llvm::AtomicOrdering::Unordered) &&
         "Unexpected Atomic Ordering.");

  bool Flush = false;
  llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;

  switch (AK) {
  case Read:
    if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
    }
    break;
  case Write:
  case Compare:
  case Update:
    if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Release;
      Flush = true;
    }
    break;
  case Capture:
    switch (AO) {
    case AtomicOrdering::Acquire:
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
      break;
    case AtomicOrdering::Release:
      FlushAO = AtomicOrdering::Release;
      Flush = true;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      FlushAO = AtomicOrdering::AcquireRelease;
      Flush = true;
      break;
    default:
      // Do nothing - leave silently.
      break;
    }
  }

  if (Flush) {
    // The flush runtime call does not take a memory ordering yet. Until it
    // does, we still resolve which ordering would be required (FlushAO) but
    // emit the plain flush call.
    // TODO: pass `FlushAO` once memory ordering support is added.
    (void)FlushAO;
    emitFlush(Loc);
  }

  // For AO == AtomicOrdering::Monotonic and all other combinations, do
  // nothing.
  return Flush;
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
                                  AtomicOpValue &X, AtomicOpValue &V,
                                  AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic read expected a scalar type");

  Value *XRead = nullptr;

  if (XElemTy->isIntegerTy()) {
    LoadInst *XLD =
        Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
    XLD->setAtomic(AO);
    XRead = cast<Value>(XLD);
  } else {
    // We need to perform the atomic operation as an integer of the same
    // width, then cast the result back to the element type.
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    LoadInst *XLoad =
        Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
    XLoad->setAtomic(AO);
    if (XElemTy->isFloatingPointTy()) {
      XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
    } else {
      XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
    }
  }
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
  Builder.CreateStore(XRead, V.Var, V.IsVolatile);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
                                   AtomicOpValue &X, Value *Expr,
                                   AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

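  // Atomic writes of non-integer scalars are legalized below by bitcasting
  // the value to an integer of equal width (e.g. a double becomes an i64), so
  // the operation can be performed as an integer store, mirroring
  // createAtomicRead above.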
  assert(X.Var->getType()->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic write expected a scalar type");

  if (XElemTy->isIntegerTy()) {
    StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
    XSt->setAtomic(AO);
  } else {
    // We need to bitcast and perform the atomic op as an integer.
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    Value *ExprCast =
        Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
    StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
    XSt->setAtomic(AO);
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
  assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
  if (!updateToLocation(Loc))
    return Loc.IP;

  LLVM_DEBUG({
    Type *XTy = X.Var->getType();
    assert(XTy->isPointerTy() &&
           "OMP Atomic expects a pointer to target memory");
    Type *XElemTy = X.ElemTy;
    assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
            XElemTy->isPointerTy()) &&
           "OMP atomic update expected a scalar type");
    assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
           (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
           "OpenMP atomic does not support LT or GT operations");
  });

  emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
                   X.IsVolatile, IsXBinopExpr);
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
  return Builder.saveIP();
}

// FIXME: Duplicating AtomicExpand
Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                               AtomicRMWInst::BinOp RMWOp) {
  switch (RMWOp) {
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Src1, Src2);
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Src1, Src2);
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Src1, Src2);
  case AtomicRMWInst::Nand:
    // Nand computes ~(Src1 & Src2), i.e. a bitwise 'not', not a negation.
    return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Src1, Src2);
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Src1, Src2);
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::BAD_BINOP:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    llvm_unreachable("Unsupported atomic update operation");
  }
  llvm_unreachable("Unsupported atomic update operation");
}

std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
    InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
  // or a complex datatype.
  bool emitRMWOp = false;
  switch (RMWOp) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Xchg:
    emitRMWOp = XElemTy;
    break;
  case AtomicRMWInst::Sub:
    emitRMWOp = (IsXBinopExpr && XElemTy);
    break;
  default:
    emitRMWOp = false;
  }
  emitRMWOp &= XElemTy->isIntegerTy();

  std::pair<Value *, Value *> Res;
  if (emitRMWOp) {
    Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
    // Res.second is not needed except in case of postfix captures. Generate it
    // anyway for consistency with the else branch; any DCE pass will remove it
    // otherwise. AtomicRMWInst::Xchg does not have a corresponding
    // instruction.
    if (RMWOp == AtomicRMWInst::Xchg)
      Res.second = Res.first;
    else
      Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
  } else {
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    LoadInst *OldVal =
        Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
    OldVal->setAtomic(AO);
    // CurBB
    //  |    /---\
    // ContBB    |
    //  |    \---/
    // ExitBB
    BasicBlock *CurBB = Builder.GetInsertBlock();
    Instruction *CurBBTI = CurBB->getTerminator();
    CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
    BasicBlock *ExitBB =
        CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
    BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
                                                X->getName() + ".atomic.cont");
    ContBB->getTerminator()->eraseFromParent();
    Builder.restoreIP(AllocaIP);
    AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
    NewAtomicAddr->setName(X->getName() + "x.new.val");
    Builder.SetInsertPoint(ContBB);
    llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
    PHI->addIncoming(OldVal, CurBB);
    bool IsIntTy = XElemTy->isIntegerTy();
    Value *OldExprVal = PHI;
    if (!IsIntTy) {
      if (XElemTy->isFloatingPointTy()) {
        OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
                                           X->getName() + ".atomic.fltCast");
      } else {
        OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
                                            X->getName() + ".atomic.ptrCast");
      }
    }

    Value *Upd = UpdateOp(OldExprVal, Builder);
    Builder.CreateStore(Upd, NewAtomicAddr);
    LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
    AtomicOrdering Failure =
        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
    AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
        X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
    Result->setVolatile(VolatileX);
    Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
    Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
    PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
    Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);

    Res.first = OldExprVal;
    Res.second = Upd;

    // Set the insertion point in the exit block: if we created the
    // placeholder 'unreachable' terminator above, drop it and append to the
    // block; otherwise insert before the block's original terminator.
    Instruction *ExitTI = ExitBB->getTerminator();
    if (isa<UnreachableInst>(ExitTI)) {
      ExitTI->eraseFromParent();
      Builder.SetInsertPoint(ExitBB);
    } else {
      Builder.SetInsertPoint(ExitTI);
    }
  }

  return Res;
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
    AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  LLVM_DEBUG({
    Type *XTy = X.Var->getType();
    assert(XTy->isPointerTy() &&
           "OMP Atomic expects a pointer to target memory");
    Type *XElemTy = X.ElemTy;
    assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
            XElemTy->isPointerTy()) &&
           "OMP atomic capture expected a scalar type");
    assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
           "OpenMP atomic does not support LT or GT operations");
  });

  // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
  // 'x' is simply atomically rewritten with 'expr'.
  AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
  std::pair<Value *, Value *> Result =
      emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
                       X.IsVolatile, IsXBinopExpr);

  Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
  Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
    bool IsFailOnly) {

  AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
  return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
                             IsPostfixUpdate, IsFailOnly, Failure);
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
    bool IsFailOnly, AtomicOrdering Failure) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP atomic expects a pointer to target memory");
  // Compare capture.
  if (V.Var) {
    assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
    assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
  }

  bool IsInteger = E->getType()->isIntegerTy();

  if (Op == OMPAtomicCompareOp::EQ) {
    AtomicCmpXchgInst *Result = nullptr;
    if (!IsInteger) {
      IntegerType *IntCastTy =
          IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
      Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
      Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
      Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
                                           AO, Failure);
    } else {
      Result =
          Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
    }

    if (V.Var) {
      Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
      if (!IsInteger)
        OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
      assert(OldValue->getType() == V.ElemTy &&
             "OldValue and V must be of same type");
      if (IsPostfixUpdate) {
        Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
      } else {
        Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
        if (IsFailOnly) {
          // CurBB----
          //   |     |
          //   v     |
          // ContBB  |
          //   |     |
          //   v     |
          // ExitBB <-
          //
          // where ContBB only contains the store of the old value to 'v'.
          BasicBlock *CurBB = Builder.GetInsertBlock();
          Instruction *CurBBTI = CurBB->getTerminator();
          CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
          BasicBlock *ExitBB = CurBB->splitBasicBlock(
              CurBBTI, X.Var->getName() + ".atomic.exit");
          BasicBlock *ContBB = CurBB->splitBasicBlock(
              CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
          ContBB->getTerminator()->eraseFromParent();
          CurBB->getTerminator()->eraseFromParent();

          Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);

          Builder.SetInsertPoint(ContBB);
          Builder.CreateStore(OldValue, V.Var);
          Builder.CreateBr(ExitBB);

          // Set the insertion point in the exit block: if we created the
          // placeholder 'unreachable' terminator above, drop it and append to
          // the block; otherwise insert before the block's original
          // terminator.
          Instruction *ExitTI = ExitBB->getTerminator();
          if (isa<UnreachableInst>(ExitTI)) {
            ExitTI->eraseFromParent();
            Builder.SetInsertPoint(ExitBB);
          } else {
            Builder.SetInsertPoint(ExitTI);
          }
        } else {
          Value *CapturedValue =
              Builder.CreateSelect(SuccessOrFail, E, OldValue);
          Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
        }
      }
    }
    // The comparison result has to be stored.
    if (R.Var) {
      assert(R.Var->getType()->isPointerTy() &&
             "r.var must be of pointer type");
      assert(R.ElemTy->isIntegerTy() && "r must be of integral type");

      Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
      Value *ResultCast = R.IsSigned
                              ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
                              : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
      Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
    }
  } else {
    assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
           "Op should be either max or min at this point");
    assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");

    // We reverse the operation, as the OpenMP forms differ from the LLVM
    // forms. Take max as an example:
    // OpenMP form:
    //   x = x > expr ? expr : x;
    // LLVM form:
    //   *ptr = *ptr > val ? *ptr : val;
    // To get the LLVM form we need to rewrite it as:
    //   x = x <= expr ? x : expr;
    AtomicRMWInst::BinOp NewOp;
    if (IsXBinopExpr) {
      if (IsInteger) {
        if (X.IsSigned)
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
                                                : AtomicRMWInst::Max;
        else
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
                                                : AtomicRMWInst::UMax;
      } else {
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
                                              : AtomicRMWInst::FMax;
      }
    } else {
      if (IsInteger) {
        if (X.IsSigned)
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
                                                : AtomicRMWInst::Min;
        else
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
                                                : AtomicRMWInst::UMin;
      } else {
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
                                              : AtomicRMWInst::FMin;
      }
    }

    AtomicRMWInst *OldValue =
        Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
    if (V.Var) {
      Value *CapturedValue = nullptr;
      if (IsPostfixUpdate) {
        CapturedValue = OldValue;
      } else {
        CmpInst::Predicate Pred;
        switch (NewOp) {
        case AtomicRMWInst::Max:
          Pred = CmpInst::ICMP_SGT;
          break;
        case AtomicRMWInst::UMax:
          Pred = CmpInst::ICMP_UGT;
          break;
        case AtomicRMWInst::FMax:
          Pred = CmpInst::FCMP_OGT;
          break;
        case AtomicRMWInst::Min:
          Pred = CmpInst::ICMP_SLT;
          break;
        case AtomicRMWInst::UMin:
          Pred = CmpInst::ICMP_ULT;
          break;
        case AtomicRMWInst::FMin:
          Pred = CmpInst::FCMP_OLT;
          break;
        default:
          llvm_unreachable("unexpected comparison op");
        }
        Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
        CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
      }
      Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
    }
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
                             Value *NumTeamsUpper, Value *ThreadLimit,
                             Value *IfExpr) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Function *CurrentFunction = Builder.GetInsertBlock()->getParent();

  // The outer allocation basic block is the entry block of the current
  // function.
  BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
  if (&OuterAllocaBB == Builder.GetInsertBlock()) {
    BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
    Builder.SetInsertPoint(BodyBB, BodyBB->begin());
  }

  // The current basic block is split into four basic blocks. After outlining,
  // they will be mapped as follows:
  // ```
  // def current_fn() {
  //   current_basic_block:
  //     br label %teams.exit
  //   teams.exit:
  //     ; instructions after teams
  // }
  //
  // def outlined_fn() {
  //   teams.alloca:
  //     br label %teams.body
  //   teams.body:
  //     ; instructions within teams body
  // }
  // ```
  BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
  BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
  BasicBlock *AllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");

  // Push num_teams.
  if (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr) {
    assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
           "if lowerbound is non-null, then upperbound must also be non-null "
           "for bounds on num_teams");

    if (NumTeamsUpper == nullptr)
      NumTeamsUpper = Builder.getInt32(0);

    if (NumTeamsLower == nullptr)
      NumTeamsLower = NumTeamsUpper;

    if (IfExpr) {
      assert(IfExpr->getType()->isIntegerTy() &&
             "argument to if clause must be an integer value");

      // upper = ifexpr ? upper : 1
      if (IfExpr->getType() != Int1)
        IfExpr = Builder.CreateICmpNE(IfExpr,
                                      ConstantInt::get(IfExpr->getType(), 0));
      NumTeamsUpper = Builder.CreateSelect(
          IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");

      // lower = ifexpr ? lower : 1
      NumTeamsLower = Builder.CreateSelect(
          IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
    }

    if (ThreadLimit == nullptr)
      ThreadLimit = Builder.getInt32(0);

    Value *ThreadNum = getOrCreateThreadID(Ident);
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
        {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
  }
  // Generate the body of teams.
  InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
  InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
  BodyGenCB(AllocaIP, CodeGenIP);

  OutlineInfo OI;
  OI.EntryBB = AllocaBB;
  OI.ExitBB = ExitBB;
  OI.OuterAllocaBB = &OuterAllocaBB;

  // Insert fake values for the global tid and bound tid.
  std::stack<Instruction *> ToBeDeleted;
  InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));

  OI.PostOutlineCB = [this, Ident, ToBeDeleted](Function &OutlinedFn) mutable {
    // The stale call instruction will be replaced with a new call instruction
    // for the runtime call with the outlined function.

    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
    ToBeDeleted.push(StaleCI);

    assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
           "Outlined function must have two or three arguments only");

    bool HasShared = OutlinedFn.arg_size() == 3;

    OutlinedFn.getArg(0)->setName("global.tid.ptr");
    OutlinedFn.getArg(1)->setName("bound.tid.ptr");
    if (HasShared)
      OutlinedFn.getArg(2)->setName("data");

    // Call to the runtime function for teams in the current function.
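    // The replacement call has the shape (a sketch; the outlined function
    // name is whatever the outliner chose):
    //
    //   call void @__kmpc_fork_teams(ptr @ident, i32 <num-args>,
    //                                ptr @outlined_fn[, ptr %data])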
    assert(StaleCI && "Error while outlining - no CallInst user found for the "
                      "outlined function.");
    Builder.SetInsertPoint(StaleCI);
    SmallVector<Value *> Args = {
        Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
    if (HasShared)
      Args.push_back(StaleCI->getArgOperand(2));
    Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                           omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
                       Args);

    while (!ToBeDeleted.empty()) {
      ToBeDeleted.top()->eraseFromParent();
      ToBeDeleted.pop();
    }
  };

  addOutlineInfo(std::move(OI));

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  return Builder.saveIP();
}

GlobalVariable *
OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                                       std::string VarName) {
  llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
      llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
                           Names.size()),
      Names);
  auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
      M, MapNamesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
      VarName);
  return MapNamesArrayGlobal;
}

// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                            \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize);                            \
  VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                 \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);           \
  VarName##Ptr = PointerType::getUnqual(VarName);
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...)                     \
  T = StructType::getTypeByName(Ctx, StructName);                             \
  if (!T)                                                                     \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed);           \
  VarName = T;                                                                \
  VarName##Ptr = PointerType::getUnqual(T);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}

void OpenMPIRBuilder::OutlineInfo::collectBlocks(
    SmallPtrSetImpl<BasicBlock *> &BlockSet,
    SmallVectorImpl<BasicBlock *> &BlockVector) {
  SmallVector<BasicBlock *, 32> Worklist;
  BlockSet.insert(EntryBB);
  BlockSet.insert(ExitBB);

  Worklist.push_back(EntryBB);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    BlockVector.push_back(BB);
    for (BasicBlock *SuccBB : successors(BB))
      if (BlockSet.insert(SuccBB).second)
        Worklist.push_back(SuccBB);
  }
}

void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
                                         uint64_t Size, int32_t Flags,
                                         GlobalValue::LinkageTypes,
                                         StringRef Name) {
  if (!Config.isGPU()) {
    llvm::offloading::emitOffloadingEntry(
        M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
        "omp_offloading_entries");
    return;
  }
  // TODO: Add support for global variables on the device after declare target
  // support.
  Function *Fn = dyn_cast<Function>(Addr);
  if (!Fn)
    return;

  Module &M = *(Fn->getParent());
  LLVMContext &Ctx = M.getContext();

  // Get the "nvvm.annotations" metadata node.
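  // The operand appended below has the shape (a sketch, with @fn standing in
  // for the kernel function Fn):
  //
  //   !nvvm.annotations = !{..., !N}
  //   !N = !{ptr @fn, !"kernel", i32 1}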
  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  Metadata *MDVals[] = {
      ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(MDNode::get(Ctx, MDVals));

  // Add a function attribute for the kernel.
  Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
  if (T.isAMDGCN())
    Fn->addFnAttr("uniform-work-group-size", "true");
  Fn->addFnAttr(Attribute::MustProgress);
}

// We only generate metadata for functions that contain target regions.
void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
    EmitMetadataErrorReportFunctionTy &ErrorFn) {

  // If there are no entries, we don't need to do anything.
  if (OffloadInfoManager.empty())
    return;

  LLVMContext &C = M.getContext();
  SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
                        TargetRegionEntryInfo>,
              16>
      OrderedEntries(OffloadInfoManager.size());

  // Auxiliary methods to create metadata values and strings.
  auto &&GetMDInt = [this](unsigned V) {
    return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
  };

  auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };

  // Create the offloading info metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
  auto &&TargetRegionMetadataEmitter =
      [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
          const TargetRegionEntryInfo &EntryInfo,
          const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
        // Generate metadata for target regions. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (0).
        // - Entry 1 -> Device ID of the file where the entry was identified.
        // - Entry 2 -> File ID of the file where the entry was identified.
        // - Entry 3 -> Mangled name of the function where the entry was
        //   identified.
        // - Entry 4 -> Line in the file where the entry was identified.
        // - Entry 5 -> Count of regions at this DeviceID/FileID/Line.
        // - Entry 6 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {
            GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
            GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
            GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
            GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);

  // Create a function that emits metadata for each device global variable
  // entry.
  auto &&DeviceGlobalVarMetadataEmitter =
      [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
          StringRef MangledName,
          const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
        // Generate metadata for global variables. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (1).
        // - Entry 1 -> Mangled name of the variable.
        // - Entry 2 -> Declare target kind.
        // - Entry 3 -> Order the entry was created.
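        // For example, such an entry might look like this (a sketch with
        // made-up values): !{i32 1, !"my_var", i32 0, i32 7}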
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
                           GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
        OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
      DeviceGlobalVarMetadataEmitter);

  for (const auto &E : OrderedEntries) {
    assert(E.first && "All ordered entries must exist!");
    if (const auto *CE =
            dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
                E.first)) {
      if (!CE->getID() || !CE->getAddress()) {
        // Do not blame the entry if the parent function is not emitted.
        TargetRegionEntryInfo EntryInfo = E.second;
        StringRef FnName = EntryInfo.ParentName;
        if (!M.getNamedValue(FnName))
          continue;
        ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
        continue;
      }
      createOffloadEntry(CE->getID(), CE->getAddress(),
                         /*Size=*/0, CE->getFlags(),
                         GlobalValue::WeakAnyLinkage);
    } else if (const auto *CE = dyn_cast<
                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
                   E.first)) {
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              CE->getFlags());
      switch (Flags) {
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
        if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
          continue;
        }
        // The variable has no definition - no need to add the entry.
        if (CE->getVarSize() == 0)
          continue;
        break;
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
        assert(((Config.isTargetDevice() && !CE->getAddress()) ||
                (!Config.isTargetDevice() && CE->getAddress())) &&
               "Declare target link address is set.");
        if (Config.isTargetDevice())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
          continue;
        }
        break;
      default:
        break;
      }

      // Hidden or internal symbols on the device are not externally visible.
      // We should not attempt to register them by creating an offloading
      // entry. Indirect variables are handled separately on the device.
      if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
        if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
          continue;

      // Indirect globals need to use a special name that doesn't match the
      // name of the associated host global.
      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
        createOffloadEntry(CE->getAddress(), CE->getAddress(),
                           CE->getVarSize(), Flags, CE->getLinkage(),
                           CE->getVarName());
      else
        createOffloadEntry(CE->getAddress(), CE->getAddress(),
                           CE->getVarSize(), Flags, CE->getLinkage());

    } else {
      llvm_unreachable("Unsupported entry kind.");
    }
  }
}

void TargetRegionEntryInfo::getTargetRegionEntryFnName(
    SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
    unsigned FileID, unsigned Line, unsigned Count) {
  raw_svector_ostream OS(Name);
  OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
     << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
  if (Count)
    OS << "_" << Count;
}

void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
    SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
  unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
  TargetRegionEntryInfo::getTargetRegionEntryFnName(
      Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
      EntryInfo.Line, NewCount);
}

TargetRegionEntryInfo
OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                                          StringRef ParentName) {
  sys::fs::UniqueID ID;
  auto FileIDInfo = CallBack();
  if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
    report_fatal_error(("Unable to get unique ID for file, during "
                        "getTargetEntryUniqueInfo, error message: " +
                        EC.message())
                           .c_str());
  }

  return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
                               std::get<1>(FileIDInfo));
}

unsigned OpenMPIRBuilder::getFlagMemberOffset() {
  unsigned Offset = 0;
  for (uint64_t Remain =
           static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
               omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
       !(Remain & 1); Remain = Remain >> 1)
    Offset++;
  return Offset;
}

omp::OpenMPOffloadMappingFlags
OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
  // Shift the 1-based position into the MEMBER_OF bits of the flag.
  return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
                                                     << getFlagMemberOffset());
}

void OpenMPIRBuilder::setCorrectMemberOfFlag(
    omp::OpenMPOffloadMappingFlags &Flags,
    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
  // If the entry is PTR_AND_OBJ but has not been marked with the special
  // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
  // marked as MEMBER_OF.
  if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
          (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
          omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
    return;

  // Reset the placeholder value to prepare the flag for the assignment of the
  // proper MEMBER_OF value.
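  // For example (a sketch): if Flags is PTR_AND_OBJ | MEMBER_OF(0xFFFF) and
  // MemberOfFlag encodes MEMBER_OF(3), the placeholder bits are cleared below
  // and the result is PTR_AND_OBJ | MEMBER_OF(3).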
  Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
  Flags |= MemberOfFlag;
}

Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
  // TODO: convert this to utilise the IRBuilder Config rather than
  // a passed down argument.
  if (OpenMPSIMD)
    return nullptr;

  if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
      ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
        CaptureClause ==
            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
       Config.hasRequiresUnifiedSharedMemory())) {
    SmallString<64> PtrName;
    {
      raw_svector_ostream OS(PtrName);
      OS << MangledName;
      if (!IsExternallyVisible)
        OS << format("_%x", EntryInfo.FileID);
      OS << "_decl_tgt_ref_ptr";
    }

    Value *Ptr = M.getNamedValue(PtrName);

    if (!Ptr) {
      GlobalValue *GlobalVal = M.getNamedValue(MangledName);
      Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);

      auto *GV = cast<GlobalVariable>(Ptr);
      GV->setLinkage(GlobalValue::WeakAnyLinkage);

      if (!Config.isTargetDevice()) {
        if (GlobalInitializer)
          GV->setInitializer(GlobalInitializer());
        else
          GV->setInitializer(GlobalVal);
      }

      registerTargetGlobalVariable(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
    }

    return cast<Constant>(Ptr);
  }

  return nullptr;
}

void OpenMPIRBuilder::registerTargetGlobalVariable(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
    Constant *Addr) {
  if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
      (TargetTriple.empty() && !Config.isTargetDevice()))
    return;

  OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
  StringRef VarName;
  int64_t VarSize;
  GlobalValue::LinkageTypes Linkage;

  if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
       CaptureClause ==
           OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
      !Config.hasRequiresUnifiedSharedMemory()) {
    Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
    VarName = MangledName;
    GlobalValue *LlvmVal = M.getNamedValue(VarName);

    if (!IsDeclaration)
      VarSize = divideCeil(
          M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
    else
      VarSize = 0;
    Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();

    // This is a workaround carried over from Clang which prevents undesired
    // optimisation of internal variables.
    if (Config.isTargetDevice() &&
        (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
      // Do not create a "ref-variable" if the original is not also available
      // on the host.
      if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
        return;

      std::string RefName = createPlatformSpecificName({VarName, "ref"});

      if (!M.getNamedValue(RefName)) {
        Constant *AddrRef =
            getOrCreateInternalVariable(Addr->getType(), RefName);
        auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
        GvAddrRef->setConstant(true);
        GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
        GvAddrRef->setInitializer(Addr);
        GeneratedRefs.push_back(GvAddrRef);
      }
    }
  } else {
    if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
    else
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;

    if (Config.isTargetDevice()) {
      VarName = (Addr) ? Addr->getName() : "";
      Addr = nullptr;
    } else {
      Addr = getAddrOfDeclareTargetVar(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          LlvmPtrTy, GlobalInitializer, VariableLinkage);
      VarName = (Addr) ? Addr->getName() : "";
    }
    VarSize = M.getDataLayout().getPointerSize();
    Linkage = GlobalValue::WeakAnyLinkage;
  }

  OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
                                                      Flags, Linkage);
}

/// Loads all the offload entries information from the host IR metadata.
void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
  // If we are in target mode, load the metadata from the host IR. This code
  // has to match the metadata creation in createOffloadEntriesAndInfoMetadata().
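  //
  // For reference (a sketch), a target region operand has the shape:
  //   !{i32 0, i32 <device-id>, i32 <file-id>, !"<parent-name>", i32 <line>,
  //     i32 <count>, i32 <order>}
  // and a device global variable operand:
  //   !{i32 1, !"<mangled-name>", i32 <flags>, i32 <order>}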
  NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
  if (!MD)
    return;

  for (MDNode *MN : MD->operands()) {
    auto &&GetMDInt = [MN](unsigned Idx) {
      auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
      return cast<ConstantInt>(V->getValue())->getZExtValue();
    };

    auto &&GetMDString = [MN](unsigned Idx) {
      auto *V = cast<MDString>(MN->getOperand(Idx));
      return V->getString();
    };

    switch (GetMDInt(0)) {
    default:
      llvm_unreachable("Unexpected metadata!");
      break;
    case OffloadEntriesInfoManager::OffloadEntryInfo::
        OffloadingEntryInfoTargetRegion: {
      TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
                                      /*DeviceID=*/GetMDInt(1),
                                      /*FileID=*/GetMDInt(2),
                                      /*Line=*/GetMDInt(4),
                                      /*Count=*/GetMDInt(5));
      OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
                                                         /*Order=*/GetMDInt(6));
      break;
    }
    case OffloadEntriesInfoManager::OffloadEntryInfo::
        OffloadingEntryInfoDeviceGlobalVar:
      OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
          /*MangledName=*/GetMDString(1),
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              /*Flags=*/GetMDInt(2)),
          /*Order=*/GetMDInt(3));
      break;
    }
  }
}

void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
  if (HostFilePath.empty())
    return;

  auto Buf = MemoryBuffer::getFile(HostFilePath);
  if (std::error_code Err = Buf.getError()) {
    report_fatal_error(("error opening host file from host file path inside of "
                        "OpenMPIRBuilder: " +
                        Err.message())
                           .c_str());
  }

  LLVMContext Ctx;
  auto M = expectedToErrorOrAndEmitErrors(
      Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
  if (std::error_code Err = M.getError()) {
    report_fatal_error(
        ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
            .c_str());
  }

  loadOffloadInfoMetadata(*M.get());
}

Function *OpenMPIRBuilder::createRegisterRequires(StringRef Name) {
  // Skip the creation of the registration function if this is device codegen.
  if (Config.isTargetDevice())
    return nullptr;

  Builder.ClearInsertionPoint();

  // Create the registration function prototype.
  auto *RegFnTy = FunctionType::get(Builder.getVoidTy(), {});
  auto *RegFn = Function::Create(
      RegFnTy, GlobalVariable::LinkageTypes::InternalLinkage, Name, M);
  RegFn->setSection(".text.startup");
  RegFn->addFnAttr(Attribute::NoInline);
  RegFn->addFnAttr(Attribute::NoUnwind);

  // Create the registration function body.
  auto *BB = BasicBlock::Create(M.getContext(), "entry", RegFn);
  ConstantInt *FlagsVal =
      ConstantInt::getSigned(Builder.getInt64Ty(), Config.getRequiresFlags());
  Function *RTLRegFn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___tgt_register_requires);

  Builder.SetInsertPoint(BB);
  Builder.CreateCall(RTLRegFn, {FlagsVal});
  Builder.CreateRetVoid();

  return RegFn;
}

//===----------------------------------------------------------------------===//
// OffloadEntriesInfoManager
//===----------------------------------------------------------------------===//

bool OffloadEntriesInfoManager::empty() const {
  return OffloadEntriesTargetRegion.empty() &&
         OffloadEntriesDeviceGlobalVar.empty();
}

unsigned
OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) const {
  auto It = OffloadEntriesTargetRegionCount.find(
      getTargetRegionEntryCountKey(EntryInfo));
  if (It == OffloadEntriesTargetRegionCount.end())
    return 0;
  return It->second;
}

void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) {
  OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
      EntryInfo.Count + 1;
}

/// Initialize target region entry.
void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
    const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
  OffloadEntriesTargetRegion[EntryInfo] =
      OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
                                   OMPTargetRegionEntryTargetRegion);
  ++OffloadingEntriesNum;
}

void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
    TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
    OMPTargetRegionEntryKind Flags) {
  assert(EntryInfo.Count == 0 && "expected default EntryInfo");

  // Update the EntryInfo with the next available count for this location.
  EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);

  // If we are emitting code for a target, the entry is already initialized;
  // it only has to be registered.
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasTargetRegionEntryInfo(EntryInfo)) {
      return;
    }
    auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
    Entry.setAddress(Addr);
    Entry.setID(ID);
    Entry.setFlags(Flags);
  } else {
    if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
        hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
      return;
    assert(!hasTargetRegionEntryInfo(EntryInfo) &&
           "Target region entry already registered!");
    OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
    OffloadEntriesTargetRegion[EntryInfo] = Entry;
    ++OffloadingEntriesNum;
  }
  incrementTargetRegionEntryInfoCount(EntryInfo);
}

bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
    TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {

  // Update the EntryInfo with the next available count for this location.
  EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);

  auto It = OffloadEntriesTargetRegion.find(EntryInfo);
  if (It == OffloadEntriesTargetRegion.end()) {
    return false;
  }
  // Fail if this entry is already registered.
  if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
    return false;
  return true;
}

void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
    const OffloadTargetRegionEntryInfoActTy &Action) {
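  // Note: entries are visited here in the container's iteration order;
  // createOffloadEntriesAndInfoMetadata() re-orders them by creation index
  // afterwards (via OrderedEntries).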
  // Scan all target region entries and perform the provided action.
  for (const auto &It : OffloadEntriesTargetRegion) {
    Action(It.first, It.second);
  }
}

void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
    StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
  OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
  ++OffloadingEntriesNum;
}

void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
    StringRef VarName, Constant *Addr, int64_t VarSize,
    OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasDeviceGlobalVarEntryInfo(VarName))
      return;
    auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
    if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    Entry.setVarSize(VarSize);
    Entry.setLinkage(Linkage);
    Entry.setAddress(Addr);
  } else {
    if (hasDeviceGlobalVarEntryInfo(VarName)) {
      auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
      assert(Entry.isValid() && Entry.getFlags() == Flags &&
             "Entry not initialized!");
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
                                                Addr, VarSize, Flags, Linkage,
                                                VarName.str());
    else
      OffloadEntriesDeviceGlobalVar.try_emplace(
          VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
    ++OffloadingEntriesNum;
  }
}

void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
    const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
  // Scan all device global variable entries and perform the provided action.
  for (const auto &E : OffloadEntriesDeviceGlobalVar)
    Action(E.getKey(), E.getValue());
}

//===----------------------------------------------------------------------===//
// CanonicalLoopInfo
//===----------------------------------------------------------------------===//

void CanonicalLoopInfo::collectControlBlocks(
    SmallVectorImpl<BasicBlock *> &BBs) {
  // We only count those BBs as control blocks which we do not need to
  // traverse the CFG to find, i.e. not the loop body which can contain
  // arbitrary control flow. For consistency, this also means we do not add
  // the Body block, which is just the entry to the body code.
  BBs.reserve(BBs.size() + 6);
  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
}

BasicBlock *CanonicalLoopInfo::getPreheader() const {
  assert(isValid() && "Requires a valid canonical loop");
  for (BasicBlock *Pred : predecessors(Header)) {
    if (Pred != Latch)
      return Pred;
  }
  llvm_unreachable("Missing preheader");
}

void CanonicalLoopInfo::setTripCount(Value *TripCount) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *CmpI = &getCond()->front();
  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
  CmpI->setOperand(1, TripCount);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::mapIndVar(
    llvm::function_ref<Value *(Instruction *)> Updater) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *OldIV = getIndVar();

  // Record all uses excluding those introduced by the updater. Uses by the
  // CanonicalLoopInfo itself to keep track of the number of iterations are
  // excluded.
  SmallVector<Use *> ReplaceableUses;
  for (Use &U : OldIV->uses()) {
    auto *User = dyn_cast<Instruction>(U.getUser());
    if (!User)
      continue;
    if (User->getParent() == getCond())
      continue;
    if (User->getParent() == getLatch())
      continue;
    ReplaceableUses.push_back(&U);
  }

  // Run the updater that may introduce new uses.
  Value *NewIV = Updater(OldIV);

  // Replace the old uses with the value returned by the updater.
  for (Use *U : ReplaceableUses)
    U->set(NewIV);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
  // No constraints if this object currently does not describe a loop.
  if (!isValid())
    return;

  BasicBlock *Preheader = getPreheader();
  BasicBlock *Body = getBody();
  BasicBlock *After = getAfter();

  // Verify standard control-flow we use for OpenMP loops.
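  //
  // A sketch of the shape being verified:
  //
  //   Preheader -> Header -> Cond --(true)--> Body -> ... -> Latch -> Header
  //                          Cond --(false)-> Exit -> After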
  assert(Preheader);
  assert(isa<BranchInst>(Preheader->getTerminator()) &&
         "Preheader must terminate with unconditional branch");
  assert(Preheader->getSingleSuccessor() == Header &&
         "Preheader must jump to header");

  assert(Header);
  assert(isa<BranchInst>(Header->getTerminator()) &&
         "Header must terminate with unconditional branch");
  assert(Header->getSingleSuccessor() == Cond &&
         "Header must jump to exiting block");

  assert(Cond);
  assert(Cond->getSinglePredecessor() == Header &&
         "Exiting block only reachable from header");

  assert(isa<BranchInst>(Cond->getTerminator()) &&
         "Exiting block must terminate with conditional branch");
  assert(size(successors(Cond)) == 2 &&
         "Exiting block must have two successors");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
         "Exiting block's first successor must jump to the body");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
         "Exiting block's second successor must exit the loop");

  assert(Body);
  assert(Body->getSinglePredecessor() == Cond &&
         "Body only reachable from exiting block");
  assert(!isa<PHINode>(Body->front()));

  assert(Latch);
  assert(isa<BranchInst>(Latch->getTerminator()) &&
         "Latch must terminate with unconditional branch");
  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
  // TODO: To support simple redirecting of the end of the body code when it
  // consists of multiple blocks, introduce another auxiliary basic block like
  // preheader and after.
  assert(Latch->getSinglePredecessor() != nullptr);
  assert(!isa<PHINode>(Latch->front()));

  assert(Exit);
  assert(isa<BranchInst>(Exit->getTerminator()) &&
         "Exit block must terminate with unconditional branch");
  assert(Exit->getSingleSuccessor() == After &&
         "Exit block must jump to after block");

  assert(After);
  assert(After->getSinglePredecessor() == Exit &&
         "After block only reachable from exit block");
  assert(After->empty() || !isa<PHINode>(After->front()));

  Instruction *IndVar = getIndVar();
  assert(IndVar && "Canonical induction variable not found?");
  assert(isa<IntegerType>(IndVar->getType()) &&
         "Induction variable must be an integer");
  assert(cast<PHINode>(IndVar)->getParent() == Header &&
         "Induction variable must be a PHI in the loop header");
  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
  assert(
      cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);

  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
             ->isOne());

  Value *TripCount = getTripCount();
  assert(TripCount && "Loop trip count not found?");
  assert(IndVar->getType() == TripCount->getType() &&
         "Trip count and induction variable must have the same type");

  auto *CmpI = cast<CmpInst>(&Cond->front());
  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
         "Exit condition must be an unsigned less-than comparison");
  assert(CmpI->getOperand(0) == IndVar &&
         "Exit condition must compare the induction variable");
  assert(CmpI->getOperand(1) == TripCount &&
         "Exit condition must compare with the trip count");
#endif
}

void CanonicalLoopInfo::invalidate() {
  Header = nullptr;
  Cond = nullptr;
  Latch = nullptr;
  Exit = nullptr;
}