//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Slot_Size

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //   /* globalized vars */[WarpSize] align (max(decl_align,
  //   GlobalMemoryAlignment))
  //   /* globalized vars */ for EscapedDeclsForTeams
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
                 E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                    ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    bool IsCaptured = false;
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        IsCaptured = true;
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType()) {
      // If not captured at the target region level then mark the escaped
      // variable as delayed.
      if (IsCaptured)
        EscapedVariableLengthDecls.insert(VD);
      else
        DelayedVariableLengthDecls.insert(VD);
    } else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct;
          // if so, those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (VD->isInitCapture())
      VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    return MappedDeclsFields.lookup(VD);
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }

  /// Returns the list of the delayed variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
    return DelayedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_teams_loop:
  case OMPD_target_parallel_loop:
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, IsSPMD);
}

void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : RT(RT), EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
    }
  } Action(*this, EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd', depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master; otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode = new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
                                              : OMP_TGT_EXEC_MODE_GENERIC),
      Twine(Name, "_exec_mode"));
  GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
  CGM.addCompilerUsedGlobal(GVMode);
}

void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsTargetDevice,
                                     isGPU(), hasRequiresUnifiedSharedMemory(),
                                     CGM.getLangOpts().OpenMPOffloadMandatory);
  OMPBuilder.setConfig(Config);

  if (!CGM.getLangOpts().OpenMPIsTargetDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
    return;

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                              "__omp_rtl_debug_kind");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                              "__omp_rtl_assume_teams_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                              "__omp_rtl_assume_threads_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                              "__omp_rtl_assume_no_thread_state");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
                              "__omp_rtl_assume_no_nested_parallelism");
}

void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}

llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          CGF, D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}

/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}

llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reduction variables unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), std::nullopt, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variable's actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }

  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}

bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return false;

  // Check if the variable declaration is delayed.
  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
}

std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &Bld = CGF.Builder;

  // Compute size and alignment.
  llvm::Value *Size = CGF.getTypeSize(VD->getType());
  CharUnits Align = CGM.getContext().getDeclAlign(VD);
  Size = Bld.CreateNUWAdd(
      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
  Size = Bld.CreateUDiv(Size, AlignVal);
  Size = Bld.CreateNUWMul(Size, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::Value *AllocArgs[] = {Size};
  llvm::CallBase *VoidPtr =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                          AllocArgs, VD->getName());
  VoidPtr->addRetAttr(llvm::Attribute::get(
      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));

  return std::make_pair(VoidPtr, Size);
}

void CGOpenMPRuntimeGPU::getKmpcFreeShared(
    CodeGenFunction &CGF,
    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
  // Deallocate the memory for each globalized VLA object
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_free_shared),
                      {AddrSizePair.first, AddrSizePair.second});
}

void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Deallocate the memory for each globalized VLA object that was
    // globalized in the prolog (i.e. emitGenericVarsProlog).
    for (const auto &AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Deallocate the memory for each globalized value
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}

void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr =
        Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}

void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_cancel_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}

void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if the loop counter exceeds the upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute the region, and which
  // threads should go straight to the synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in the current team to finish, then increments
  // the counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
      Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}

static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
                                                  IntTy);
    ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, IntTy->getPointerTo(), IntTy);
    if (Size.getQuantity() / IntSize > 1) {
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
      ElemPtr =
          Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          CGF.Int8Ty, PtrEnd.getPointer(),
          Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
                                                  CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    Size = Size % IntSize;
  }
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
1550 ScratchpadToThread, 1551 }; 1552 } // namespace 1553 1554 struct CopyOptionsTy { 1555 llvm::Value *RemoteLaneOffset; 1556 llvm::Value *ScratchpadIndex; 1557 llvm::Value *ScratchpadWidth; 1558 }; 1559 1560 /// Emit instructions to copy a Reduce list, which contains partially 1561 /// aggregated values, in the specified direction. 1562 static void emitReductionListCopy( 1563 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, 1564 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase, 1565 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { 1566 1567 CodeGenModule &CGM = CGF.CGM; 1568 ASTContext &C = CGM.getContext(); 1569 CGBuilderTy &Bld = CGF.Builder; 1570 1571 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; 1572 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex; 1573 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth; 1574 1575 // Iterates, element-by-element, through the source Reduce list and 1576 // make a copy. 1577 unsigned Idx = 0; 1578 unsigned Size = Privates.size(); 1579 for (const Expr *Private : Privates) { 1580 Address SrcElementAddr = Address::invalid(); 1581 Address DestElementAddr = Address::invalid(); 1582 Address DestElementPtrAddr = Address::invalid(); 1583 // Should we shuffle in an element from a remote lane? 1584 bool ShuffleInElement = false; 1585 // Set to true to update the pointer in the dest Reduce list to a 1586 // newly created element. 1587 bool UpdateDestListPtr = false; 1588 // Increment the src or dest pointer to the scratchpad, for each 1589 // new element. 1590 bool IncrScratchpadSrc = false; 1591 bool IncrScratchpadDest = false; 1592 QualType PrivatePtrType = C.getPointerType(Private->getType()); 1593 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType); 1594 1595 switch (Action) { 1596 case RemoteLaneToThread: { 1597 // Step 1.1: Get the address for the src element in the Reduce list. 1598 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); 1599 SrcElementAddr = CGF.EmitLoadOfPointer( 1600 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), 1601 PrivatePtrType->castAs<PointerType>()); 1602 1603 // Step 1.2: Create a temporary to store the element in the destination 1604 // Reduce list. 1605 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); 1606 DestElementAddr = 1607 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); 1608 ShuffleInElement = true; 1609 UpdateDestListPtr = true; 1610 break; 1611 } 1612 case ThreadCopy: { 1613 // Step 1.1: Get the address for the src element in the Reduce list. 1614 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); 1615 SrcElementAddr = CGF.EmitLoadOfPointer( 1616 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), 1617 PrivatePtrType->castAs<PointerType>()); 1618 1619 // Step 1.2: Get the address for dest element. The destination 1620 // element has already been created on the thread's stack. 1621 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); 1622 DestElementAddr = CGF.EmitLoadOfPointer( 1623 DestElementPtrAddr.withElementType(PrivateLlvmPtrType), 1624 PrivatePtrType->castAs<PointerType>()); 1625 break; 1626 } 1627 case ThreadToScratchpad: { 1628 // Step 1.1: Get the address for the src element in the Reduce list. 
1629 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx); 1630 SrcElementAddr = CGF.EmitLoadOfPointer( 1631 SrcElementPtrAddr.withElementType(PrivateLlvmPtrType), 1632 PrivatePtrType->castAs<PointerType>()); 1633 1634 // Step 1.2: Get the address for dest element: 1635 // address = base + index * ElementSizeInChars. 1636 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); 1637 llvm::Value *CurrentOffset = 1638 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); 1639 llvm::Value *ScratchPadElemAbsolutePtrVal = 1640 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset); 1641 ScratchPadElemAbsolutePtrVal = 1642 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); 1643 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty, 1644 C.getTypeAlignInChars(Private->getType())); 1645 IncrScratchpadDest = true; 1646 break; 1647 } 1648 case ScratchpadToThread: { 1649 // Step 1.1: Get the address for the src element in the scratchpad. 1650 // address = base + index * ElementSizeInChars. 1651 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); 1652 llvm::Value *CurrentOffset = 1653 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex); 1654 llvm::Value *ScratchPadElemAbsolutePtrVal = 1655 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset); 1656 ScratchPadElemAbsolutePtrVal = 1657 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); 1658 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty, 1659 C.getTypeAlignInChars(Private->getType())); 1660 IncrScratchpadSrc = true; 1661 1662 // Step 1.2: Create a temporary to store the element in the destination 1663 // Reduce list. 1664 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx); 1665 DestElementAddr = 1666 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); 1667 UpdateDestListPtr = true; 1668 break; 1669 } 1670 } 1671 1672 // Regardless of src and dest of copy, we emit the load of src 1673 // element as this is required in all directions 1674 SrcElementAddr = SrcElementAddr.withElementType( 1675 CGF.ConvertTypeForMem(Private->getType())); 1676 DestElementAddr = 1677 DestElementAddr.withElementType(SrcElementAddr.getElementType()); 1678 1679 // Now that all active lanes have read the element in the 1680 // Reduce list, shuffle over the value from the remote lane. 1681 if (ShuffleInElement) { 1682 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), 1683 RemoteLaneOffset, Private->getExprLoc()); 1684 } else { 1685 switch (CGF.getEvaluationKind(Private->getType())) { 1686 case TEK_Scalar: { 1687 llvm::Value *Elem = CGF.EmitLoadOfScalar( 1688 SrcElementAddr, /*Volatile=*/false, Private->getType(), 1689 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type), 1690 TBAAAccessInfo()); 1691 // Store the source element value to the dest element address. 
1692 CGF.EmitStoreOfScalar( 1693 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(), 1694 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 1695 break; 1696 } 1697 case TEK_Complex: { 1698 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( 1699 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), 1700 Private->getExprLoc()); 1701 CGF.EmitStoreOfComplex( 1702 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), 1703 /*isInit=*/false); 1704 break; 1705 } 1706 case TEK_Aggregate: 1707 CGF.EmitAggregateCopy( 1708 CGF.MakeAddrLValue(DestElementAddr, Private->getType()), 1709 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), 1710 Private->getType(), AggValueSlot::DoesNotOverlap); 1711 break; 1712 } 1713 } 1714 1715 // Step 3.1: Modify reference in dest Reduce list as needed. 1716 // Modifying the reference in Reduce list to point to the newly 1717 // created element. The element is live in the current function 1718 // scope and that of functions it invokes (i.e., reduce_function). 1719 // RemoteReduceData[i] = (void*)&RemoteElem 1720 if (UpdateDestListPtr) { 1721 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( 1722 DestElementAddr.getPointer(), CGF.VoidPtrTy), 1723 DestElementPtrAddr, /*Volatile=*/false, 1724 C.VoidPtrTy); 1725 } 1726 1727 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting 1728 // address of the next element in scratchpad memory, unless we're currently 1729 // processing the last one. Memory alignment is also taken care of here. 1730 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) { 1731 // FIXME: This code doesn't make any sense, it's trying to perform 1732 // integer arithmetic on pointers. 1733 llvm::Value *ScratchpadBasePtr = 1734 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer(); 1735 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType()); 1736 ScratchpadBasePtr = Bld.CreateNUWAdd( 1737 ScratchpadBasePtr, 1738 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars)); 1739 1740 // Take care of global memory alignment for performance 1741 ScratchpadBasePtr = Bld.CreateNUWSub( 1742 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); 1743 ScratchpadBasePtr = Bld.CreateUDiv( 1744 ScratchpadBasePtr, 1745 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); 1746 ScratchpadBasePtr = Bld.CreateNUWAdd( 1747 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1)); 1748 ScratchpadBasePtr = Bld.CreateNUWMul( 1749 ScratchpadBasePtr, 1750 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); 1751 1752 if (IncrScratchpadDest) 1753 DestBase = 1754 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign()); 1755 else /* IncrScratchpadSrc = true */ 1756 SrcBase = 1757 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign()); 1758 } 1759 1760 ++Idx; 1761 } 1762 } 1763 1764 /// This function emits a helper that gathers Reduce lists from the first 1765 /// lane of every active warp to lanes in the first warp. 
1766 /// 1767 /// void inter_warp_copy_func(void* reduce_data, num_warps) 1768 /// shared smem[warp_size]; 1769 /// For all data entries D in reduce_data: 1770 /// sync 1771 /// If (I am the first lane in each warp) 1772 /// Copy my local D to smem[warp_id] 1773 /// sync 1774 /// if (I am the first warp) 1775 /// Copy smem[thread_id] to my local D 1776 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, 1777 ArrayRef<const Expr *> Privates, 1778 QualType ReductionArrayTy, 1779 SourceLocation Loc) { 1780 ASTContext &C = CGM.getContext(); 1781 llvm::Module &M = CGM.getModule(); 1782 1783 // ReduceList: thread local Reduce list. 1784 // At the stage of the computation when this function is called, partially 1785 // aggregated values reside in the first lane of every active warp. 1786 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 1787 C.VoidPtrTy, ImplicitParamDecl::Other); 1788 // NumWarps: number of warps active in the parallel region. This could 1789 // be smaller than 32 (max warps in a CTA) for partial block reduction. 1790 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 1791 C.getIntTypeForBitwidth(32, /* Signed */ true), 1792 ImplicitParamDecl::Other); 1793 FunctionArgList Args; 1794 Args.push_back(&ReduceListArg); 1795 Args.push_back(&NumWarpsArg); 1796 1797 const CGFunctionInfo &CGFI = 1798 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 1799 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), 1800 llvm::GlobalValue::InternalLinkage, 1801 "_omp_reduction_inter_warp_copy_func", &M); 1802 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 1803 Fn->setDoesNotRecurse(); 1804 CodeGenFunction CGF(CGM); 1805 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 1806 1807 CGBuilderTy &Bld = CGF.Builder; 1808 1809 // This array is used as a medium to transfer, one reduce element at a time, 1810 // the data from the first lane of every warp to lanes in the first warp 1811 // in order to perform the final step of a reduction in a parallel region 1812 // (reduction across warps). The array is placed in NVPTX __shared__ memory 1813 // for reduced latency, as well as to have a distinct copy for concurrently 1814 // executing target regions. The array is declared with common linkage so 1815 // as to be shared across compilation units. 1816 StringRef TransferMediumName = 1817 "__openmp_nvptx_data_transfer_temporary_storage"; 1818 llvm::GlobalVariable *TransferMedium = 1819 M.getGlobalVariable(TransferMediumName); 1820 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; 1821 if (!TransferMedium) { 1822 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); 1823 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); 1824 TransferMedium = new llvm::GlobalVariable( 1825 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage, 1826 llvm::UndefValue::get(Ty), TransferMediumName, 1827 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, 1828 SharedAddressSpace); 1829 CGM.addCompilerUsedGlobal(TransferMedium); 1830 } 1831 1832 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 1833 // Get the CUDA thread id of the current OpenMP thread on the GPU. 
1834 llvm::Value *ThreadID = RT.getGPUThreadID(CGF); 1835 // nvptx_lane_id = nvptx_id % warpsize 1836 llvm::Value *LaneID = getNVPTXLaneID(CGF); 1837 // nvptx_warp_id = nvptx_id / warpsize 1838 llvm::Value *WarpID = getNVPTXWarpID(CGF); 1839 1840 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 1841 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); 1842 Address LocalReduceList( 1843 Bld.CreatePointerBitCastOrAddrSpaceCast( 1844 CGF.EmitLoadOfScalar( 1845 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc, 1846 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()), 1847 ElemTy->getPointerTo()), 1848 ElemTy, CGF.getPointerAlign()); 1849 1850 unsigned Idx = 0; 1851 for (const Expr *Private : Privates) { 1852 // 1853 // Warp master copies reduce element to transfer medium in __shared__ 1854 // memory. 1855 // 1856 unsigned RealTySize = 1857 C.getTypeSizeInChars(Private->getType()) 1858 .alignTo(C.getTypeAlignInChars(Private->getType())) 1859 .getQuantity(); 1860 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) { 1861 unsigned NumIters = RealTySize / TySize; 1862 if (NumIters == 0) 1863 continue; 1864 QualType CType = C.getIntTypeForBitwidth( 1865 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1); 1866 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType); 1867 CharUnits Align = CharUnits::fromQuantity(TySize); 1868 llvm::Value *Cnt = nullptr; 1869 Address CntAddr = Address::invalid(); 1870 llvm::BasicBlock *PrecondBB = nullptr; 1871 llvm::BasicBlock *ExitBB = nullptr; 1872 if (NumIters > 1) { 1873 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr"); 1874 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr, 1875 /*Volatile=*/false, C.IntTy); 1876 PrecondBB = CGF.createBasicBlock("precond"); 1877 ExitBB = CGF.createBasicBlock("exit"); 1878 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body"); 1879 // There is no need to emit line number for unconditional branch. 1880 (void)ApplyDebugLocation::CreateEmpty(CGF); 1881 CGF.EmitBlock(PrecondBB); 1882 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc); 1883 llvm::Value *Cmp = 1884 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters)); 1885 Bld.CreateCondBr(Cmp, BodyBB, ExitBB); 1886 CGF.EmitBlock(BodyBB); 1887 } 1888 // kmpc_barrier. 1889 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, 1890 /*EmitChecks=*/false, 1891 /*ForceSimpleCall=*/true); 1892 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); 1893 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); 1894 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); 1895 1896 // if (lane_id == 0) 1897 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master"); 1898 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); 1899 CGF.EmitBlock(ThenBB); 1900 1901 // Reduce element = LocalReduceList[i] 1902 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 1903 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 1904 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 1905 // elemptr = ((CopyType*)(elemptrptr)) + I 1906 Address ElemPtr(ElemPtrPtr, CopyType, Align); 1907 if (NumIters > 1) 1908 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt); 1909 1910 // Get pointer to location in transfer medium. 1911 // MediumPtr = &medium[warp_id] 1912 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( 1913 TransferMedium->getValueType(), TransferMedium, 1914 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); 1915 // Casting to actual data type. 
1916 // MediumPtr = (CopyType*)MediumPtrAddr; 1917 Address MediumPtr( 1918 Bld.CreateBitCast( 1919 MediumPtrVal, 1920 CopyType->getPointerTo( 1921 MediumPtrVal->getType()->getPointerAddressSpace())), 1922 CopyType, Align); 1923 1924 // elem = *elemptr 1925 //*MediumPtr = elem 1926 llvm::Value *Elem = CGF.EmitLoadOfScalar( 1927 ElemPtr, /*Volatile=*/false, CType, Loc, 1928 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 1929 // Store the source element value to the dest element address. 1930 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType, 1931 LValueBaseInfo(AlignmentSource::Type), 1932 TBAAAccessInfo()); 1933 1934 Bld.CreateBr(MergeBB); 1935 1936 CGF.EmitBlock(ElseBB); 1937 Bld.CreateBr(MergeBB); 1938 1939 CGF.EmitBlock(MergeBB); 1940 1941 // kmpc_barrier. 1942 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown, 1943 /*EmitChecks=*/false, 1944 /*ForceSimpleCall=*/true); 1945 1946 // 1947 // Warp 0 copies reduce element from transfer medium. 1948 // 1949 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); 1950 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); 1951 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); 1952 1953 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); 1954 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( 1955 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc); 1956 1957 // Up to 32 threads in warp 0 are active. 1958 llvm::Value *IsActiveThread = 1959 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); 1960 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); 1961 1962 CGF.EmitBlock(W0ThenBB); 1963 1964 // SrcMediumPtr = &medium[tid] 1965 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( 1966 TransferMedium->getValueType(), TransferMedium, 1967 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); 1968 // SrcMediumVal = *SrcMediumPtr; 1969 Address SrcMediumPtr( 1970 Bld.CreateBitCast( 1971 SrcMediumPtrVal, 1972 CopyType->getPointerTo( 1973 SrcMediumPtrVal->getType()->getPointerAddressSpace())), 1974 CopyType, Align); 1975 1976 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I 1977 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 1978 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( 1979 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc); 1980 Address TargetElemPtr(TargetElemPtrVal, CopyType, Align); 1981 if (NumIters > 1) 1982 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt); 1983 1984 // *TargetElemPtr = SrcMediumVal; 1985 llvm::Value *SrcMediumValue = 1986 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc); 1987 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, 1988 CType); 1989 Bld.CreateBr(W0MergeBB); 1990 1991 CGF.EmitBlock(W0ElseBB); 1992 Bld.CreateBr(W0MergeBB); 1993 1994 CGF.EmitBlock(W0MergeBB); 1995 1996 if (NumIters > 1) { 1997 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1)); 1998 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy); 1999 CGF.EmitBranch(PrecondBB); 2000 (void)ApplyDebugLocation::CreateEmpty(CGF); 2001 CGF.EmitBlock(ExitBB); 2002 } 2003 RealTySize %= TySize; 2004 } 2005 ++Idx; 2006 } 2007 2008 CGF.FinishFunction(); 2009 return Fn; 2010 } 2011 2012 /// Emit a helper that reduces data across two OpenMP threads (lanes) 2013 /// in the same warp. It uses shuffle instructions to copy over data from 2014 /// a remote lane's stack. The reduction algorithm performed is specified 2015 /// by the fourth parameter. 2016 /// 2017 /// Algorithm Versions. 
2018 /// Full Warp Reduce (argument value 0): 2019 /// This algorithm assumes that all 32 lanes are active and gathers 2020 /// data from these 32 lanes, producing a single resultant value. 2021 /// Contiguous Partial Warp Reduce (argument value 1): 2022 /// This algorithm assumes that only a *contiguous* subset of lanes 2023 /// are active. This happens for the last warp in a parallel region 2024 /// when the user specified num_threads is not an integer multiple of 2025 /// 32. This contiguous subset always starts with the zeroth lane. 2026 /// Partial Warp Reduce (argument value 2): 2027 /// This algorithm gathers data from any number of lanes at any position. 2028 /// All reduced values are stored in the lowest possible lane. The set 2029 /// of problems every algorithm addresses is a super set of those 2030 /// addressable by algorithms with a lower version number. Overhead 2031 /// increases as algorithm version increases. 2032 /// 2033 /// Terminology 2034 /// Reduce element: 2035 /// Reduce element refers to the individual data field with primitive 2036 /// data types to be combined and reduced across threads. 2037 /// Reduce list: 2038 /// Reduce list refers to a collection of local, thread-private 2039 /// reduce elements. 2040 /// Remote Reduce list: 2041 /// Remote Reduce list refers to a collection of remote (relative to 2042 /// the current thread) reduce elements. 2043 /// 2044 /// We distinguish between three states of threads that are important to 2045 /// the implementation of this function. 2046 /// Alive threads: 2047 /// Threads in a warp executing the SIMT instruction, as distinguished from 2048 /// threads that are inactive due to divergent control flow. 2049 /// Active threads: 2050 /// The minimal set of threads that has to be alive upon entry to this 2051 /// function. The computation is correct iff active threads are alive. 2052 /// Some threads are alive but they are not active because they do not 2053 /// contribute to the computation in any useful manner. Turning them off 2054 /// may introduce control flow overheads without any tangible benefits. 2055 /// Effective threads: 2056 /// In order to comply with the argument requirements of the shuffle 2057 /// function, we must keep all lanes holding data alive. But at most 2058 /// half of them perform value aggregation; we refer to this half of 2059 /// threads as effective. The other half is simply handing off their 2060 /// data. 2061 /// 2062 /// Procedure 2063 /// Value shuffle: 2064 /// In this step active threads transfer data from higher lane positions 2065 /// in the warp to lower lane positions, creating Remote Reduce list. 2066 /// Value aggregation: 2067 /// In this step, effective threads combine their thread local Reduce list 2068 /// with Remote Reduce list and store the result in the thread local 2069 /// Reduce list. 2070 /// Value copy: 2071 /// In this step, we deal with the assumption made by algorithm 2 2072 /// (i.e. contiguity assumption). When we have an odd number of lanes 2073 /// active, say 2k+1, only k threads will be effective and therefore k 2074 /// new values will be produced. However, the Reduce list owned by the 2075 /// (2k+1)th thread is ignored in the value aggregation. Therefore 2076 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so 2077 /// that the contiguity assumption still holds. 
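///
/// A rough sketch of the helper emitted below, in pseudocode only (helper
/// names such as copy_from_remote_lane are illustrative, not real symbols):
///
///   void shuffle_and_reduce(void *reduce_data, short lane_id,
///                           short remote_offset, short algo_version) {
///     ReduceData remote;
///     // gather the Remote Reduce list via the warp shuffle intrinsics
///     copy_from_remote_lane(&remote, reduce_data, remote_offset);
///     if (algo_version == 0 ||
///         (algo_version == 1 && lane_id < remote_offset) ||
///         (algo_version == 2 && lane_id % 2 == 0 && remote_offset > 0))
///       reduce_function(reduce_data, &remote);
///     if (algo_version == 1 && lane_id >= remote_offset)
///       copy(reduce_data, &remote); // preserve the contiguity assumption
///   }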
2078 static llvm::Function *emitShuffleAndReduceFunction( 2079 CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 2080 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) { 2081 ASTContext &C = CGM.getContext(); 2082 2083 // Thread local Reduce list used to host the values of data to be reduced. 2084 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2085 C.VoidPtrTy, ImplicitParamDecl::Other); 2086 // Current lane id; could be logical. 2087 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy, 2088 ImplicitParamDecl::Other); 2089 // Offset of the remote source lane relative to the current lane. 2090 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2091 C.ShortTy, ImplicitParamDecl::Other); 2092 // Algorithm version. This is expected to be known at compile time. 2093 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2094 C.ShortTy, ImplicitParamDecl::Other); 2095 FunctionArgList Args; 2096 Args.push_back(&ReduceListArg); 2097 Args.push_back(&LaneIDArg); 2098 Args.push_back(&RemoteLaneOffsetArg); 2099 Args.push_back(&AlgoVerArg); 2100 2101 const CGFunctionInfo &CGFI = 2102 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 2103 auto *Fn = llvm::Function::Create( 2104 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 2105 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); 2106 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 2107 Fn->setDoesNotRecurse(); 2108 2109 CodeGenFunction CGF(CGM); 2110 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 2111 2112 CGBuilderTy &Bld = CGF.Builder; 2113 2114 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 2115 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); 2116 Address LocalReduceList( 2117 Bld.CreatePointerBitCastOrAddrSpaceCast( 2118 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 2119 C.VoidPtrTy, SourceLocation()), 2120 ElemTy->getPointerTo()), 2121 ElemTy, CGF.getPointerAlign()); 2122 2123 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); 2124 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( 2125 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); 2126 2127 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); 2128 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( 2129 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); 2130 2131 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); 2132 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( 2133 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); 2134 2135 // Create a local thread-private variable to host the Reduce list 2136 // from a remote lane. 2137 Address RemoteReduceList = 2138 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); 2139 2140 // This loop iterates through the list of reduce elements and copies, 2141 // element by element, from a remote lane in the warp to RemoteReduceList, 2142 // hosted on the thread's stack. 2143 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, 2144 LocalReduceList, RemoteReduceList, 2145 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, 2146 /*ScratchpadIndex=*/nullptr, 2147 /*ScratchpadWidth=*/nullptr}); 2148 2149 // The actions to be performed on the Remote Reduce list is dependent 2150 // on the algorithm version. 
2151 // 2152 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && 2153 // LaneId % 2 == 0 && Offset > 0): 2154 // do the reduction value aggregation 2155 // 2156 // The thread local variable Reduce list is mutated in place to host the 2157 // reduced data, which is the aggregated value produced from local and 2158 // remote lanes. 2159 // 2160 // Note that AlgoVer is expected to be a constant integer known at compile 2161 // time. 2162 // When AlgoVer==0, the first conjunction evaluates to true, making 2163 // the entire predicate true during compile time. 2164 // When AlgoVer==1, the second conjunction has only the second part to be 2165 // evaluated during runtime. Other conjunctions evaluates to false 2166 // during compile time. 2167 // When AlgoVer==2, the third conjunction has only the second part to be 2168 // evaluated during runtime. Other conjunctions evaluates to false 2169 // during compile time. 2170 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal); 2171 2172 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); 2173 llvm::Value *CondAlgo1 = Bld.CreateAnd( 2174 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); 2175 2176 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)); 2177 llvm::Value *CondAlgo2 = Bld.CreateAnd( 2178 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)))); 2179 CondAlgo2 = Bld.CreateAnd( 2180 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); 2181 2182 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); 2183 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); 2184 2185 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); 2186 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); 2187 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); 2188 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); 2189 2190 CGF.EmitBlock(ThenBB); 2191 // reduce_function(LocalReduceList, RemoteReduceList) 2192 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2193 LocalReduceList.getPointer(), CGF.VoidPtrTy); 2194 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2195 RemoteReduceList.getPointer(), CGF.VoidPtrTy); 2196 CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 2197 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); 2198 Bld.CreateBr(MergeBB); 2199 2200 CGF.EmitBlock(ElseBB); 2201 Bld.CreateBr(MergeBB); 2202 2203 CGF.EmitBlock(MergeBB); 2204 2205 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local 2206 // Reduce list. 2207 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)); 2208 llvm::Value *CondCopy = Bld.CreateAnd( 2209 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); 2210 2211 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); 2212 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); 2213 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); 2214 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); 2215 2216 CGF.EmitBlock(CpyThenBB); 2217 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, 2218 RemoteReduceList, LocalReduceList); 2219 Bld.CreateBr(CpyMergeBB); 2220 2221 CGF.EmitBlock(CpyElseBB); 2222 Bld.CreateBr(CpyMergeBB); 2223 2224 CGF.EmitBlock(CpyMergeBB); 2225 2226 CGF.FinishFunction(); 2227 return Fn; 2228 } 2229 2230 /// This function emits a helper that copies all the reduction variables from 2231 /// the team into the provided global buffer for the reduction variables. 
2232 /// 2233 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) 2234 /// For all data entries D in reduce_data: 2235 /// Copy local D to buffer.D[Idx] 2236 static llvm::Value *emitListToGlobalCopyFunction( 2237 CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 2238 QualType ReductionArrayTy, SourceLocation Loc, 2239 const RecordDecl *TeamReductionRec, 2240 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 2241 &VarFieldMap) { 2242 ASTContext &C = CGM.getContext(); 2243 2244 // Buffer: global reduction buffer. 2245 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2246 C.VoidPtrTy, ImplicitParamDecl::Other); 2247 // Idx: index of the buffer. 2248 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 2249 ImplicitParamDecl::Other); 2250 // ReduceList: thread local Reduce list. 2251 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2252 C.VoidPtrTy, ImplicitParamDecl::Other); 2253 FunctionArgList Args; 2254 Args.push_back(&BufferArg); 2255 Args.push_back(&IdxArg); 2256 Args.push_back(&ReduceListArg); 2257 2258 const CGFunctionInfo &CGFI = 2259 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 2260 auto *Fn = llvm::Function::Create( 2261 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 2262 "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); 2263 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 2264 Fn->setDoesNotRecurse(); 2265 CodeGenFunction CGF(CGM); 2266 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 2267 2268 CGBuilderTy &Bld = CGF.Builder; 2269 2270 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 2271 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 2272 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); 2273 Address LocalReduceList( 2274 Bld.CreatePointerBitCastOrAddrSpaceCast( 2275 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 2276 C.VoidPtrTy, Loc), 2277 ElemTy->getPointerTo()), 2278 ElemTy, CGF.getPointerAlign()); 2279 QualType StaticTy = C.getRecordType(TeamReductionRec); 2280 llvm::Type *LLVMReductionsBufferTy = 2281 CGM.getTypes().ConvertTypeForMem(StaticTy); 2282 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2283 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 2284 LLVMReductionsBufferTy->getPointerTo()); 2285 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 2286 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 2287 /*Volatile=*/false, C.IntTy, 2288 Loc)}; 2289 unsigned Idx = 0; 2290 for (const Expr *Private : Privates) { 2291 // Reduce element = LocalReduceList[i] 2292 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 2293 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 2294 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 2295 // elemptr = ((CopyType*)(elemptrptr)) + I 2296 ElemTy = CGF.ConvertTypeForMem(Private->getType()); 2297 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2298 ElemPtrPtr, ElemTy->getPointerTo()); 2299 Address ElemPtr = 2300 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType())); 2301 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); 2302 // Global = Buffer.VD[Idx]; 2303 const FieldDecl *FD = VarFieldMap.lookup(VD); 2304 LValue GlobLVal = CGF.EmitLValueForField( 2305 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 2306 Address GlobAddr = GlobLVal.getAddress(CGF); 
2307 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(), 2308 GlobAddr.getPointer(), Idxs); 2309 GlobLVal.setAddress(Address(BufferPtr, 2310 CGF.ConvertTypeForMem(Private->getType()), 2311 GlobAddr.getAlignment())); 2312 switch (CGF.getEvaluationKind(Private->getType())) { 2313 case TEK_Scalar: { 2314 llvm::Value *V = CGF.EmitLoadOfScalar( 2315 ElemPtr, /*Volatile=*/false, Private->getType(), Loc, 2316 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()); 2317 CGF.EmitStoreOfScalar(V, GlobLVal); 2318 break; 2319 } 2320 case TEK_Complex: { 2321 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( 2322 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); 2323 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); 2324 break; 2325 } 2326 case TEK_Aggregate: 2327 CGF.EmitAggregateCopy(GlobLVal, 2328 CGF.MakeAddrLValue(ElemPtr, Private->getType()), 2329 Private->getType(), AggValueSlot::DoesNotOverlap); 2330 break; 2331 } 2332 ++Idx; 2333 } 2334 2335 CGF.FinishFunction(); 2336 return Fn; 2337 } 2338 2339 /// This function emits a helper that reduces all the reduction variables from 2340 /// the team into the provided global buffer for the reduction variables. 2341 /// 2342 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) 2343 /// void *GlobPtrs[]; 2344 /// GlobPtrs[0] = (void*)&buffer.D0[Idx]; 2345 /// ... 2346 /// GlobPtrs[N] = (void*)&buffer.DN[Idx]; 2347 /// reduce_function(GlobPtrs, reduce_data); 2348 static llvm::Value *emitListToGlobalReduceFunction( 2349 CodeGenModule &CGM, ArrayRef<const Expr *> Privates, 2350 QualType ReductionArrayTy, SourceLocation Loc, 2351 const RecordDecl *TeamReductionRec, 2352 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 2353 &VarFieldMap, 2354 llvm::Function *ReduceFn) { 2355 ASTContext &C = CGM.getContext(); 2356 2357 // Buffer: global reduction buffer. 2358 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2359 C.VoidPtrTy, ImplicitParamDecl::Other); 2360 // Idx: index of the buffer. 2361 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, 2362 ImplicitParamDecl::Other); 2363 // ReduceList: thread local Reduce list. 2364 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2365 C.VoidPtrTy, ImplicitParamDecl::Other); 2366 FunctionArgList Args; 2367 Args.push_back(&BufferArg); 2368 Args.push_back(&IdxArg); 2369 Args.push_back(&ReduceListArg); 2370 2371 const CGFunctionInfo &CGFI = 2372 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 2373 auto *Fn = llvm::Function::Create( 2374 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 2375 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); 2376 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 2377 Fn->setDoesNotRecurse(); 2378 CodeGenFunction CGF(CGM); 2379 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 2380 2381 CGBuilderTy &Bld = CGF.Builder; 2382 2383 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 2384 QualType StaticTy = C.getRecordType(TeamReductionRec); 2385 llvm::Type *LLVMReductionsBufferTy = 2386 CGM.getTypes().ConvertTypeForMem(StaticTy); 2387 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2388 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 2389 LLVMReductionsBufferTy->getPointerTo()); 2390 2391 // 1. Build a list of reduction variables. 
2392   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2393   Address ReductionList =
2394       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2395   auto IPriv = Privates.begin();
2396   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2397                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2398                                               /*Volatile=*/false, C.IntTy,
2399                                               Loc)};
2400   unsigned Idx = 0;
2401   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2402     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2403     // Global = Buffer.VD[Idx];
2404     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2405     const FieldDecl *FD = VarFieldMap.lookup(VD);
2406     LValue GlobLVal = CGF.EmitLValueForField(
2407         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2408     Address GlobAddr = GlobLVal.getAddress(CGF);
2409     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2410         GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2411     CGF.EmitStoreOfScalar(BufferPtr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2412     if ((*IPriv)->getType()->isVariablyModifiedType()) {
2413       // Store array size.
2414       ++Idx;
2415       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2416       llvm::Value *Size = CGF.Builder.CreateIntCast(
2417           CGF.getVLASize(
2418               CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2419               .NumElts,
2420           CGF.SizeTy, /*isSigned=*/false);
2421       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2422                               Elem);
2423     }
2424   }
2425 
2426   // Call reduce_function(GlobalReduceList, ReduceList)
2427   llvm::Value *GlobalReduceList = ReductionList.getPointer();
2428   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2429   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2430       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2431   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2432       CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2433   CGF.FinishFunction();
2434   return Fn;
2435 }
2436 
2437 /// This function emits a helper that copies all the reduction variables from
2438 /// the provided global buffer back into the team's reduction variables.
2439 ///
2440 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2441 ///  For all data entries D in reduce_data:
2442 ///    Copy buffer.D[Idx] to local D;
2443 static llvm::Value *emitGlobalToListCopyFunction(
2444     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2445     QualType ReductionArrayTy, SourceLocation Loc,
2446     const RecordDecl *TeamReductionRec,
2447     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2448         &VarFieldMap) {
2449   ASTContext &C = CGM.getContext();
2450 
2451   // Buffer: global reduction buffer.
2452   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2453                               C.VoidPtrTy, ImplicitParamDecl::Other);
2454   // Idx: index of the buffer.
2455   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2456                            ImplicitParamDecl::Other);
2457   // ReduceList: thread local Reduce list.
2458 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, 2459 C.VoidPtrTy, ImplicitParamDecl::Other); 2460 FunctionArgList Args; 2461 Args.push_back(&BufferArg); 2462 Args.push_back(&IdxArg); 2463 Args.push_back(&ReduceListArg); 2464 2465 const CGFunctionInfo &CGFI = 2466 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); 2467 auto *Fn = llvm::Function::Create( 2468 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 2469 "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); 2470 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 2471 Fn->setDoesNotRecurse(); 2472 CodeGenFunction CGF(CGM); 2473 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); 2474 2475 CGBuilderTy &Bld = CGF.Builder; 2476 2477 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 2478 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); 2479 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy); 2480 Address LocalReduceList( 2481 Bld.CreatePointerBitCastOrAddrSpaceCast( 2482 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, 2483 C.VoidPtrTy, Loc), 2484 ElemTy->getPointerTo()), 2485 ElemTy, CGF.getPointerAlign()); 2486 QualType StaticTy = C.getRecordType(TeamReductionRec); 2487 llvm::Type *LLVMReductionsBufferTy = 2488 CGM.getTypes().ConvertTypeForMem(StaticTy); 2489 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2490 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), 2491 LLVMReductionsBufferTy->getPointerTo()); 2492 2493 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 2494 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 2495 /*Volatile=*/false, C.IntTy, 2496 Loc)}; 2497 unsigned Idx = 0; 2498 for (const Expr *Private : Privates) { 2499 // Reduce element = LocalReduceList[i] 2500 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); 2501 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( 2502 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); 2503 // elemptr = ((CopyType*)(elemptrptr)) + I 2504 ElemTy = CGF.ConvertTypeForMem(Private->getType()); 2505 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 2506 ElemPtrPtr, ElemTy->getPointerTo()); 2507 Address ElemPtr = 2508 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType())); 2509 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl(); 2510 // Global = Buffer.VD[Idx]; 2511 const FieldDecl *FD = VarFieldMap.lookup(VD); 2512 LValue GlobLVal = CGF.EmitLValueForField( 2513 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 2514 Address GlobAddr = GlobLVal.getAddress(CGF); 2515 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(), 2516 GlobAddr.getPointer(), Idxs); 2517 GlobLVal.setAddress(Address(BufferPtr, 2518 CGF.ConvertTypeForMem(Private->getType()), 2519 GlobAddr.getAlignment())); 2520 switch (CGF.getEvaluationKind(Private->getType())) { 2521 case TEK_Scalar: { 2522 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); 2523 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(), 2524 LValueBaseInfo(AlignmentSource::Type), 2525 TBAAAccessInfo()); 2526 break; 2527 } 2528 case TEK_Complex: { 2529 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); 2530 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), 2531 /*isInit=*/false); 2532 break; 2533 } 2534 case TEK_Aggregate: 2535 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, 
                              Private->getType()),
2536                           GlobLVal, Private->getType(),
2537                           AggValueSlot::DoesNotOverlap);
2538       break;
2539     }
2540     ++Idx;
2541   }
2542 
2543   CGF.FinishFunction();
2544   return Fn;
2545 }
2546 
2547 /// This function emits a helper that reduces the reduction variables in the
2548 /// provided global buffer into the team's reduce_data.
2549 ///
2550 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2551 ///  void *GlobPtrs[];
2552 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
2553 ///  ...
2554 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
2555 ///  reduce_function(reduce_data, GlobPtrs);
2556 static llvm::Value *emitGlobalToListReduceFunction(
2557     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2558     QualType ReductionArrayTy, SourceLocation Loc,
2559     const RecordDecl *TeamReductionRec,
2560     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2561         &VarFieldMap,
2562     llvm::Function *ReduceFn) {
2563   ASTContext &C = CGM.getContext();
2564 
2565   // Buffer: global reduction buffer.
2566   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2567                               C.VoidPtrTy, ImplicitParamDecl::Other);
2568   // Idx: index of the buffer.
2569   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2570                            ImplicitParamDecl::Other);
2571   // ReduceList: thread local Reduce list.
2572   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2573                                   C.VoidPtrTy, ImplicitParamDecl::Other);
2574   FunctionArgList Args;
2575   Args.push_back(&BufferArg);
2576   Args.push_back(&IdxArg);
2577   Args.push_back(&ReduceListArg);
2578 
2579   const CGFunctionInfo &CGFI =
2580       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2581   auto *Fn = llvm::Function::Create(
2582       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2583       "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2584   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2585   Fn->setDoesNotRecurse();
2586   CodeGenFunction CGF(CGM);
2587   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2588 
2589   CGBuilderTy &Bld = CGF.Builder;
2590 
2591   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2592   QualType StaticTy = C.getRecordType(TeamReductionRec);
2593   llvm::Type *LLVMReductionsBufferTy =
2594       CGM.getTypes().ConvertTypeForMem(StaticTy);
2595   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2596       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2597       LLVMReductionsBufferTy->getPointerTo());
2598 
2599   // 1. Build a list of reduction variables.
2600 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; 2601 Address ReductionList = 2602 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); 2603 auto IPriv = Privates.begin(); 2604 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), 2605 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), 2606 /*Volatile=*/false, C.IntTy, 2607 Loc)}; 2608 unsigned Idx = 0; 2609 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { 2610 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 2611 // Global = Buffer.VD[Idx]; 2612 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl(); 2613 const FieldDecl *FD = VarFieldMap.lookup(VD); 2614 LValue GlobLVal = CGF.EmitLValueForField( 2615 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); 2616 Address GlobAddr = GlobLVal.getAddress(CGF); 2617 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP( 2618 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs); 2619 CGF.EmitStoreOfScalar(BufferPtr, Elem, /*Volatile=*/false, C.VoidPtrTy); 2620 if ((*IPriv)->getType()->isVariablyModifiedType()) { 2621 // Store array size. 2622 ++Idx; 2623 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 2624 llvm::Value *Size = CGF.Builder.CreateIntCast( 2625 CGF.getVLASize( 2626 CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) 2627 .NumElts, 2628 CGF.SizeTy, /*isSigned=*/false); 2629 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), 2630 Elem); 2631 } 2632 } 2633 2634 // Call reduce_function(ReduceList, GlobalReduceList) 2635 llvm::Value *GlobalReduceList = ReductionList.getPointer(); 2636 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); 2637 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( 2638 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); 2639 CGM.getOpenMPRuntime().emitOutlinedFunctionCall( 2640 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); 2641 CGF.FinishFunction(); 2642 return Fn; 2643 } 2644 2645 /// 2646 /// Design of OpenMP reductions on the GPU 2647 /// 2648 /// Consider a typical OpenMP program with one or more reduction 2649 /// clauses: 2650 /// 2651 /// float foo; 2652 /// double bar; 2653 /// #pragma omp target teams distribute parallel for \ 2654 /// reduction(+:foo) reduction(*:bar) 2655 /// for (int i = 0; i < N; i++) { 2656 /// foo += A[i]; bar *= B[i]; 2657 /// } 2658 /// 2659 /// where 'foo' and 'bar' are reduced across all OpenMP threads in 2660 /// all teams. In our OpenMP implementation on the NVPTX device an 2661 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 2662 /// within a team are mapped to CUDA threads within a threadblock. 2663 /// Our goal is to efficiently aggregate values across all OpenMP 2664 /// threads such that: 2665 /// 2666 /// - the compiler and runtime are logically concise, and 2667 /// - the reduction is performed efficiently in a hierarchical 2668 /// manner as follows: within OpenMP threads in the same warp, 2669 /// across warps in a threadblock, and finally across teams on 2670 /// the NVPTX device. 2671 /// 2672 /// Introduction to Decoupling 2673 /// 2674 /// We would like to decouple the compiler and the runtime so that the 2675 /// latter is ignorant of the reduction variables (number, data types) 2676 /// and the reduction operators. This allows a simpler interface 2677 /// and implementation while still attaining good performance. 
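///
/// In practice this means the runtime only ever receives an opaque
/// reduce_data pointer together with compiler-generated callbacks whose
/// rough shapes are (illustrative typedefs, not the exact runtime
/// declarations):
///
///   typedef void (*ShuffleReduceFn)(void *reduce_data, short lane_id,
///                                   short remote_lane_offset, short algo_ver);
///   typedef void (*InterWarpCopyFn)(void *reduce_data, int num_warps);
///
/// All knowledge of the number, types and operators of the reduction
/// variables is confined to these helpers.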
2678 /// 2679 /// Pseudocode for the aforementioned OpenMP program generated by the 2680 /// compiler is as follows: 2681 /// 2682 /// 1. Create private copies of reduction variables on each OpenMP 2683 /// thread: 'foo_private', 'bar_private' 2684 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 2685 /// to it and writes the result in 'foo_private' and 'bar_private' 2686 /// respectively. 2687 /// 3. Call the OpenMP runtime on the GPU to reduce within a team 2688 /// and store the result on the team master: 2689 /// 2690 /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 2691 /// reduceData, shuffleReduceFn, interWarpCpyFn) 2692 /// 2693 /// where: 2694 /// struct ReduceData { 2695 /// double *foo; 2696 /// double *bar; 2697 /// } reduceData 2698 /// reduceData.foo = &foo_private 2699 /// reduceData.bar = &bar_private 2700 /// 2701 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 2702 /// auxiliary functions generated by the compiler that operate on 2703 /// variables of type 'ReduceData'. They aid the runtime perform 2704 /// algorithmic steps in a data agnostic manner. 2705 /// 2706 /// 'shuffleReduceFn' is a pointer to a function that reduces data 2707 /// of type 'ReduceData' across two OpenMP threads (lanes) in the 2708 /// same warp. It takes the following arguments as input: 2709 /// 2710 /// a. variable of type 'ReduceData' on the calling lane, 2711 /// b. its lane_id, 2712 /// c. an offset relative to the current lane_id to generate a 2713 /// remote_lane_id. The remote lane contains the second 2714 /// variable of type 'ReduceData' that is to be reduced. 2715 /// d. an algorithm version parameter determining which reduction 2716 /// algorithm to use. 2717 /// 2718 /// 'shuffleReduceFn' retrieves data from the remote lane using 2719 /// efficient GPU shuffle intrinsics and reduces, using the 2720 /// algorithm specified by the 4th parameter, the two operands 2721 /// element-wise. The result is written to the first operand. 2722 /// 2723 /// Different reduction algorithms are implemented in different 2724 /// runtime functions, all calling 'shuffleReduceFn' to perform 2725 /// the essential reduction step. Therefore, based on the 4th 2726 /// parameter, this function behaves slightly differently to 2727 /// cooperate with the runtime to ensure correctness under 2728 /// different circumstances. 2729 /// 2730 /// 'InterWarpCpyFn' is a pointer to a function that transfers 2731 /// reduced variables across warps. It tunnels, through CUDA 2732 /// shared memory, the thread-private data of type 'ReduceData' 2733 /// from lane 0 of each warp to a lane in the first warp. 2734 /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 2735 /// The last team writes the global reduced value to memory. 2736 /// 2737 /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 2738 /// reduceData, shuffleReduceFn, interWarpCpyFn, 2739 /// scratchpadCopyFn, loadAndReduceFn) 2740 /// 2741 /// 'scratchpadCopyFn' is a helper that stores reduced 2742 /// data from the team master to a scratchpad array in 2743 /// global memory. 2744 /// 2745 /// 'loadAndReduceFn' is a helper that loads data from 2746 /// the scratchpad array and reduces it with the input 2747 /// operand. 2748 /// 2749 /// These compiler generated functions hide address 2750 /// calculation and alignment information from the runtime. 2751 /// 5. if ret == 1: 2752 /// The team master of the last team stores the reduced 2753 /// result to the globals in memory. 
2754 ///      foo += reduceData.foo; bar *= reduceData.bar
2755 ///
2756 ///
2757 /// Warp Reduction Algorithms
2758 ///
2759 /// On the warp level, we have three algorithms implemented in the
2760 /// OpenMP runtime depending on the number of active lanes:
2761 ///
2762 /// Full Warp Reduction
2763 ///
2764 /// The reduce algorithm within a warp where all lanes are active
2765 /// is implemented in the runtime as follows:
2766 ///
2767 /// full_warp_reduce(void *reduce_data,
2768 ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2769 ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2770 ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
2771 /// }
2772 ///
2773 /// The algorithm completes in log(2, WARPSIZE) steps.
2774 ///
2775 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2776 /// not used; we therefore save instructions by not retrieving lane_id
2777 /// from the corresponding special registers. The 4th parameter, which
2778 /// represents the version of the algorithm being used, is set to 0 to
2779 /// signify full warp reduction.
2780 ///
2781 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2782 ///
2783 /// #reduce_elem refers to an element in the local lane's data structure
2784 /// #remote_elem is retrieved from a remote lane
2785 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2786 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2787 ///
2788 /// Contiguous Partial Warp Reduction
2789 ///
2790 /// This reduce algorithm is used within a warp where only the first
2791 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2792 /// number of OpenMP threads in a parallel region is not a multiple of
2793 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2794 ///
2795 /// void
2796 /// contiguous_partial_reduce(void *reduce_data,
2797 ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
2798 ///                           int size, int lane_id) {
2799 ///   int curr_size;
2800 ///   int offset;
2801 ///   curr_size = size;
2802 ///   offset = curr_size/2;
2803 ///   while (offset>0) {
2804 ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2805 ///     curr_size = (curr_size+1)/2;
2806 ///     offset = curr_size/2;
2807 ///   }
2808 /// }
2809 ///
2810 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2811 ///
2812 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2813 /// if (lane_id < offset)
2814 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
2815 /// else
2816 ///     reduce_elem = remote_elem
2817 ///
2818 /// This algorithm assumes that the data to be reduced are located in a
2819 /// contiguous subset of lanes starting from the first. When there is
2820 /// an odd number of active lanes, the data in the last lane is not
2821 /// aggregated with any other lane's data but is instead copied over.
2822 ///
2823 /// Dispersed Partial Warp Reduction
2824 ///
2825 /// This algorithm is used within a warp when any discontiguous subset of
2826 /// lanes are active. It is used to implement the reduction operation
2827 /// across lanes in an OpenMP simd region or in a nested parallel region.
2828 ///
2829 /// void
2830 /// dispersed_partial_reduce(void *reduce_data,
2831 ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2832 ///   int size, remote_id;
2833 ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
2834 ///   do {
2835 ///       remote_id = next_active_lane_id_right_after_me();
2836 ///       # the above function returns 0 if no active lane
2837 ///       # is present right after the current lane.
2838 /// size = number_of_active_lanes_in_this_warp(); 2839 /// logical_lane_id /= 2; 2840 /// ShuffleReduceFn(reduce_data, logical_lane_id, 2841 /// remote_id-1-threadIdx.x, 2); 2842 /// } while (logical_lane_id % 2 == 0 && size > 1); 2843 /// } 2844 /// 2845 /// There is no assumption made about the initial state of the reduction. 2846 /// Any number of lanes (>=1) could be active at any position. The reduction 2847 /// result is returned in the first active lane. 2848 /// 2849 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 2850 /// 2851 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 2852 /// if (lane_id % 2 == 0 && offset > 0) 2853 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 2854 /// else 2855 /// reduce_elem = remote_elem 2856 /// 2857 /// 2858 /// Intra-Team Reduction 2859 /// 2860 /// This function, as implemented in the runtime call 2861 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 2862 /// threads in a team. It first reduces within a warp using the 2863 /// aforementioned algorithms. We then proceed to gather all such 2864 /// reduced values at the first warp. 2865 /// 2866 /// The runtime makes use of the function 'InterWarpCpyFn', which copies 2867 /// data from each of the "warp master" (zeroth lane of each warp, where 2868 /// warp-reduced data is held) to the zeroth warp. This step reduces (in 2869 /// a mathematical sense) the problem of reduction across warp masters in 2870 /// a block to the problem of warp reduction. 2871 /// 2872 /// 2873 /// Inter-Team Reduction 2874 /// 2875 /// Once a team has reduced its data to a single value, it is stored in 2876 /// a global scratchpad array. Since each team has a distinct slot, this 2877 /// can be done without locking. 2878 /// 2879 /// The last team to write to the scratchpad array proceeds to reduce the 2880 /// scratchpad array. One or more workers in the last team use the helper 2881 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 2882 /// the k'th worker reduces every k'th element. 2883 /// 2884 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 2885 /// reduce across workers and compute a globally reduced value. 
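///
/// Putting the pieces together, the code emitted below for a teams
/// reduction is conceptually (simplified, argument lists abbreviated):
///
///   res = __kmpc_nvptx_teams_reduce_nowait_v2(loc, gtid, global_buffer,
///             num_buffer_slots, reduceData, shuffleReduceFn, interWarpCpyFn,
///             listToGlobalCpyFn, listToGlobalRedFn,
///             globalToListCpyFn, globalToListRedFn);
///   if (res == 1) {
///     // The winning team folds reduceData into the original variables:
///     // foo += reduceData.foo; bar *= reduceData.bar;
///   }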
2886 /// 2887 void CGOpenMPRuntimeGPU::emitReduction( 2888 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, 2889 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, 2890 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { 2891 if (!CGF.HaveInsertPoint()) 2892 return; 2893 2894 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); 2895 #ifndef NDEBUG 2896 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); 2897 #endif 2898 2899 if (Options.SimpleReduction) { 2900 assert(!TeamsReduction && !ParallelReduction && 2901 "Invalid reduction selection in emitReduction."); 2902 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, 2903 ReductionOps, Options); 2904 return; 2905 } 2906 2907 assert((TeamsReduction || ParallelReduction) && 2908 "Invalid reduction selection in emitReduction."); 2909 2910 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList), 2911 // RedList, shuffle_reduce_func, interwarp_copy_func); 2912 // or 2913 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); 2914 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 2915 llvm::Value *ThreadId = getThreadID(CGF, Loc); 2916 2917 llvm::Value *Res; 2918 ASTContext &C = CGM.getContext(); 2919 // 1. Build a list of reduction variables. 2920 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; 2921 auto Size = RHSExprs.size(); 2922 for (const Expr *E : Privates) { 2923 if (E->getType()->isVariablyModifiedType()) 2924 // Reserve place for array size. 2925 ++Size; 2926 } 2927 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); 2928 QualType ReductionArrayTy = 2929 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal, 2930 /*IndexTypeQuals=*/0); 2931 Address ReductionList = 2932 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); 2933 auto IPriv = Privates.begin(); 2934 unsigned Idx = 0; 2935 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { 2936 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 2937 CGF.Builder.CreateStore( 2938 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 2939 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), 2940 Elem); 2941 if ((*IPriv)->getType()->isVariablyModifiedType()) { 2942 // Store array size. 
2943 ++Idx; 2944 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); 2945 llvm::Value *Size = CGF.Builder.CreateIntCast( 2946 CGF.getVLASize( 2947 CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) 2948 .NumElts, 2949 CGF.SizeTy, /*isSigned=*/false); 2950 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), 2951 Elem); 2952 } 2953 } 2954 2955 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 2956 ReductionList.getPointer(), CGF.VoidPtrTy); 2957 llvm::Function *ReductionFn = emitReductionFunction( 2958 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy), 2959 Privates, LHSExprs, RHSExprs, ReductionOps); 2960 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); 2961 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( 2962 CGM, Privates, ReductionArrayTy, ReductionFn, Loc); 2963 llvm::Value *InterWarpCopyFn = 2964 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); 2965 2966 if (ParallelReduction) { 2967 llvm::Value *Args[] = {RTLoc, 2968 ThreadId, 2969 CGF.Builder.getInt32(RHSExprs.size()), 2970 ReductionArrayTySize, 2971 RL, 2972 ShuffleAndReduceFn, 2973 InterWarpCopyFn}; 2974 2975 Res = CGF.EmitRuntimeCall( 2976 OMPBuilder.getOrCreateRuntimeFunction( 2977 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2), 2978 Args); 2979 } else { 2980 assert(TeamsReduction && "expected teams reduction."); 2981 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; 2982 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); 2983 int Cnt = 0; 2984 for (const Expr *DRE : Privates) { 2985 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); 2986 ++Cnt; 2987 } 2988 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars( 2989 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 2990 C.getLangOpts().OpenMPCUDAReductionBufNum); 2991 TeamsReductions.push_back(TeamReductionRec); 2992 if (!KernelTeamsReductionPtr) { 2993 KernelTeamsReductionPtr = new llvm::GlobalVariable( 2994 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, 2995 llvm::GlobalValue::InternalLinkage, nullptr, 2996 "_openmp_teams_reductions_buffer_$_$ptr"); 2997 } 2998 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( 2999 Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()), 3000 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); 3001 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( 3002 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); 3003 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( 3004 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, 3005 ReductionFn); 3006 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( 3007 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); 3008 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( 3009 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, 3010 ReductionFn); 3011 3012 llvm::Value *Args[] = { 3013 RTLoc, 3014 ThreadId, 3015 GlobalBufferPtr, 3016 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), 3017 RL, 3018 ShuffleAndReduceFn, 3019 InterWarpCopyFn, 3020 GlobalToBufferCpyFn, 3021 GlobalToBufferRedFn, 3022 BufferToGlobalCpyFn, 3023 BufferToGlobalRedFn}; 3024 3025 Res = CGF.EmitRuntimeCall( 3026 OMPBuilder.getOrCreateRuntimeFunction( 3027 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2), 3028 Args); 3029 } 3030 
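  // For reference, the teams-reduction call emitted above conceptually has
  // the following shape (a sketch; argument names are illustrative and the
  // helpers are the functions generated just before the call):
  //
  //   res = __kmpc_nvptx_teams_reduce_nowait_v2(
  //       loc, gtid, global_buffer, /*num_of_records=*/ReductionBufNum,
  //       RedList, shuffle_and_reduce_fn, inter_warp_copy_fn,
  //       list_to_global_copy_fn, list_to_global_reduce_fn,
  //       global_to_list_copy_fn, global_to_list_reduce_fn);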
3031 // 5. Build if (res == 1) 3032 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done"); 3033 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then"); 3034 llvm::Value *Cond = CGF.Builder.CreateICmpEQ( 3035 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1)); 3036 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB); 3037 3038 // 6. Build then branch: where we have reduced values in the master 3039 // thread in each team. 3040 // __kmpc_end_reduce{_nowait}(<gtid>); 3041 // break; 3042 CGF.EmitBlock(ThenBB); 3043 3044 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>); 3045 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps, 3046 this](CodeGenFunction &CGF, PrePostActionTy &Action) { 3047 auto IPriv = Privates.begin(); 3048 auto ILHS = LHSExprs.begin(); 3049 auto IRHS = RHSExprs.begin(); 3050 for (const Expr *E : ReductionOps) { 3051 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), 3052 cast<DeclRefExpr>(*IRHS)); 3053 ++IPriv; 3054 ++ILHS; 3055 ++IRHS; 3056 } 3057 }; 3058 llvm::Value *EndArgs[] = {ThreadId}; 3059 RegionCodeGenTy RCG(CodeGen); 3060 NVPTXActionTy Action( 3061 nullptr, std::nullopt, 3062 OMPBuilder.getOrCreateRuntimeFunction( 3063 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait), 3064 EndArgs); 3065 RCG.setAction(Action); 3066 RCG(CGF); 3067 // There is no need to emit line number for unconditional branch. 3068 (void)ApplyDebugLocation::CreateEmpty(CGF); 3069 CGF.EmitBlock(ExitBB, /*IsFinished=*/true); 3070 } 3071 3072 const VarDecl * 3073 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, 3074 const VarDecl *NativeParam) const { 3075 if (!NativeParam->getType()->isReferenceType()) 3076 return NativeParam; 3077 QualType ArgType = NativeParam->getType(); 3078 QualifierCollector QC; 3079 const Type *NonQualTy = QC.strip(ArgType); 3080 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 3081 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { 3082 if (Attr->getCaptureKind() == OMPC_map) { 3083 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, 3084 LangAS::opencl_global); 3085 } 3086 } 3087 ArgType = CGM.getContext().getPointerType(PointeeTy); 3088 QC.addRestrict(); 3089 enum { NVPTX_local_addr = 5 }; 3090 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); 3091 ArgType = QC.apply(CGM.getContext(), ArgType); 3092 if (isa<ImplicitParamDecl>(NativeParam)) 3093 return ImplicitParamDecl::Create( 3094 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), 3095 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other); 3096 return ParmVarDecl::Create( 3097 CGM.getContext(), 3098 const_cast<DeclContext *>(NativeParam->getDeclContext()), 3099 NativeParam->getBeginLoc(), NativeParam->getLocation(), 3100 NativeParam->getIdentifier(), ArgType, 3101 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); 3102 } 3103 3104 Address 3105 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, 3106 const VarDecl *NativeParam, 3107 const VarDecl *TargetParam) const { 3108 assert(NativeParam != TargetParam && 3109 NativeParam->getType()->isReferenceType() && 3110 "Native arg must not be the same as target arg."); 3111 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); 3112 QualType NativeParamType = NativeParam->getType(); 3113 QualifierCollector QC; 3114 const Type *NonQualTy = QC.strip(NativeParamType); 3115 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 3116 unsigned NativePointeeAddrSpace = 3117 
CGF.getTypes().getTargetAddressSpace(NativePointeeTy); 3118 QualType TargetTy = TargetParam->getType(); 3119 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, 3120 TargetTy, SourceLocation()); 3121 // First cast to generic. 3122 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3123 TargetAddr, 3124 llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace=*/0)); 3125 // Cast from generic to native address space. 3126 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3127 TargetAddr, 3128 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace)); 3129 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); 3130 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, 3131 NativeParamType); 3132 return NativeParamAddr; 3133 } 3134 3135 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( 3136 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, 3137 ArrayRef<llvm::Value *> Args) const { 3138 SmallVector<llvm::Value *, 4> TargetArgs; 3139 TargetArgs.reserve(Args.size()); 3140 auto *FnType = OutlinedFn.getFunctionType(); 3141 for (unsigned I = 0, E = Args.size(); I < E; ++I) { 3142 if (FnType->isVarArg() && FnType->getNumParams() <= I) { 3143 TargetArgs.append(std::next(Args.begin(), I), Args.end()); 3144 break; 3145 } 3146 llvm::Type *TargetType = FnType->getParamType(I); 3147 llvm::Value *NativeArg = Args[I]; 3148 if (!TargetType->isPointerTy()) { 3149 TargetArgs.emplace_back(NativeArg); 3150 continue; 3151 } 3152 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3153 NativeArg, 3154 llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace*/ 0)); 3155 TargetArgs.emplace_back( 3156 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType)); 3157 } 3158 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); 3159 } 3160 3161 /// Emit function which wraps the outline parallel region 3162 /// and controls the arguments which are passed to this function. 3163 /// The wrapper ensures that the outlined function is called 3164 /// with the correct arguments when data is shared. 3165 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( 3166 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { 3167 ASTContext &Ctx = CGM.getContext(); 3168 const auto &CS = *D.getCapturedStmt(OMPD_parallel); 3169 3170 // Create a function that takes as argument the source thread. 3171 FunctionArgList WrapperArgs; 3172 QualType Int16QTy = 3173 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); 3174 QualType Int32QTy = 3175 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); 3176 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), 3177 /*Id=*/nullptr, Int16QTy, 3178 ImplicitParamDecl::Other); 3179 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), 3180 /*Id=*/nullptr, Int32QTy, 3181 ImplicitParamDecl::Other); 3182 WrapperArgs.emplace_back(&ParallelLevelArg); 3183 WrapperArgs.emplace_back(&WrapperArg); 3184 3185 const CGFunctionInfo &CGFI = 3186 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); 3187 3188 auto *Fn = llvm::Function::Create( 3189 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 3190 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); 3191 3192 // Ensure we do not inline the function. 
This is trivially true for the ones
3193   // passed to __kmpc_fork_call, but the ones called in serialized regions
3194   // could be inlined. This is not perfect, but it is closer to the invariant
3195   // we want, namely, every data environment starts with a new function.
3196   // TODO: We should pass the if condition to the runtime function and do the
3197   // handling there. Much cleaner code.
3198   Fn->addFnAttr(llvm::Attribute::NoInline);
3199
3200   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3201   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3202   Fn->setDoesNotRecurse();
3203
3204   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3205   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3206                     D.getBeginLoc(), D.getBeginLoc());
3207
3208   const auto *RD = CS.getCapturedRecordDecl();
3209   auto CurField = RD->field_begin();
3210
3211   Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3212                                                       /*Name=*/".zero.addr");
3213   CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3214   // Get the array of arguments.
3215   SmallVector<llvm::Value *, 8> Args;
3216
3217   Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3218   Args.emplace_back(ZeroAddr.getPointer());
3219
3220   CGBuilderTy &Bld = CGF.Builder;
3221   auto CI = CS.capture_begin();
3222
3223   // Use global memory for data sharing.
3224   // Handle passing of global args to workers.
3225   Address GlobalArgs =
3226       CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3227   llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3228   llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3229   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3230                           CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3231                       DataSharingArgs);
3232
3233   // Retrieve the shared variables from the list of references returned
3234   // by the runtime. Pass the variables to the outlined function.
3235 Address SharedArgListAddress = Address::invalid(); 3236 if (CS.capture_size() > 0 || 3237 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 3238 SharedArgListAddress = CGF.EmitLoadOfPointer( 3239 GlobalArgs, CGF.getContext() 3240 .getPointerType(CGF.getContext().VoidPtrTy) 3241 .castAs<PointerType>()); 3242 } 3243 unsigned Idx = 0; 3244 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 3245 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 3246 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 3247 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); 3248 llvm::Value *LB = CGF.EmitLoadOfScalar( 3249 TypedAddress, 3250 /*Volatile=*/false, 3251 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 3252 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); 3253 Args.emplace_back(LB); 3254 ++Idx; 3255 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 3256 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 3257 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); 3258 llvm::Value *UB = CGF.EmitLoadOfScalar( 3259 TypedAddress, 3260 /*Volatile=*/false, 3261 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 3262 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); 3263 Args.emplace_back(UB); 3264 ++Idx; 3265 } 3266 if (CS.capture_size() > 0) { 3267 ASTContext &CGFContext = CGF.getContext(); 3268 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { 3269 QualType ElemTy = CurField->getType(); 3270 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); 3271 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 3272 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)), 3273 CGF.ConvertTypeForMem(ElemTy)); 3274 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, 3275 /*Volatile=*/false, 3276 CGFContext.getPointerType(ElemTy), 3277 CI->getLocation()); 3278 if (CI->capturesVariableByCopy() && 3279 !CI->getCapturedVar()->getType()->isAnyPointerType()) { 3280 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), 3281 CI->getLocation()); 3282 } 3283 Args.emplace_back(Arg); 3284 } 3285 } 3286 3287 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); 3288 CGF.FinishFunction(); 3289 return Fn; 3290 } 3291 3292 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, 3293 const Decl *D) { 3294 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) 3295 return; 3296 3297 assert(D && "Expected function or captured|block decl."); 3298 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && 3299 "Function is registered already."); 3300 assert((!TeamAndReductions.first || TeamAndReductions.first == D) && 3301 "Team is set but not processed."); 3302 const Stmt *Body = nullptr; 3303 bool NeedToDelayGlobalization = false; 3304 if (const auto *FD = dyn_cast<FunctionDecl>(D)) { 3305 Body = FD->getBody(); 3306 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { 3307 Body = BD->getBody(); 3308 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { 3309 Body = CD->getBody(); 3310 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; 3311 if (NeedToDelayGlobalization && 3312 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) 3313 return; 3314 } 3315 if (!Body) 3316 return; 3317 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); 3318 VarChecker.Visit(Body); 3319 const RecordDecl *GlobalizedVarsRecord = 3320 VarChecker.getGlobalizedRecord(IsInTTDRegion); 3321 
TeamAndReductions.first = nullptr;
3322   TeamAndReductions.second.clear();
3323   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3324       VarChecker.getEscapedVariableLengthDecls();
3325   ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3326       VarChecker.getDelayedVariableLengthDecls();
3327   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3328       DelayedVariableLengthDecls.empty())
3329     return;
3330   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3331   I->getSecond().MappedParams =
3332       std::make_unique<CodeGenFunction::OMPMapVars>();
3333   I->getSecond().EscapedParameters.insert(
3334       VarChecker.getEscapedParameters().begin(),
3335       VarChecker.getEscapedParameters().end());
3336   I->getSecond().EscapedVariableLengthDecls.append(
3337       EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3338   I->getSecond().DelayedVariableLengthDecls.append(
3339       DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
3340   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3341   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3342     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3343     Data.insert(std::make_pair(VD, MappedVarData()));
3344   }
3345   if (!NeedToDelayGlobalization) {
3346     emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3347     struct GlobalizationScope final : EHScopeStack::Cleanup {
3348       GlobalizationScope() = default;
3349
3350       void Emit(CodeGenFunction &CGF, Flags flags) override {
3351         static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3352             .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3353       }
3354     };
3355     CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3356   }
3357 }
3358
3359 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3360                                                       const VarDecl *VD) {
3361   if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3362     const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3363     auto AS = LangAS::Default;
3364     switch (A->getAllocatorType()) {
3365     // Use the default allocator here as by default local vars are
3366     // threadlocal.
3367     case OMPAllocateDeclAttr::OMPNullMemAlloc:
3368     case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3369     case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3370     case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3371     case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3372       // Follow the user decision - use default allocation.
3373       return Address::invalid();
3374     case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3375       // TODO: implement support for user-defined allocators.
3376 return Address::invalid(); 3377 case OMPAllocateDeclAttr::OMPConstMemAlloc: 3378 AS = LangAS::cuda_constant; 3379 break; 3380 case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 3381 AS = LangAS::cuda_shared; 3382 break; 3383 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 3384 case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 3385 break; 3386 } 3387 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); 3388 auto *GV = new llvm::GlobalVariable( 3389 CGM.getModule(), VarTy, /*isConstant=*/false, 3390 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy), 3391 VD->getName(), 3392 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, 3393 CGM.getContext().getTargetAddressSpace(AS)); 3394 CharUnits Align = CGM.getContext().getDeclAlign(VD); 3395 GV->setAlignment(Align.getAsAlign()); 3396 return Address( 3397 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 3398 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace( 3399 VD->getType().getAddressSpace()))), 3400 VarTy, Align); 3401 } 3402 3403 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) 3404 return Address::invalid(); 3405 3406 VD = VD->getCanonicalDecl(); 3407 auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 3408 if (I == FunctionGlobalizedDecls.end()) 3409 return Address::invalid(); 3410 auto VDI = I->getSecond().LocalVarData.find(VD); 3411 if (VDI != I->getSecond().LocalVarData.end()) 3412 return VDI->second.PrivateAddr; 3413 if (VD->hasAttrs()) { 3414 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()), 3415 E(VD->attr_end()); 3416 IT != E; ++IT) { 3417 auto VDI = I->getSecond().LocalVarData.find( 3418 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl()) 3419 ->getCanonicalDecl()); 3420 if (VDI != I->getSecond().LocalVarData.end()) 3421 return VDI->second.PrivateAddr; 3422 } 3423 } 3424 3425 return Address::invalid(); 3426 } 3427 3428 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { 3429 FunctionGlobalizedDecls.erase(CGF.CurFn); 3430 CGOpenMPRuntime::functionFinished(CGF); 3431 } 3432 3433 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk( 3434 CodeGenFunction &CGF, const OMPLoopDirective &S, 3435 OpenMPDistScheduleClauseKind &ScheduleKind, 3436 llvm::Value *&Chunk) const { 3437 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 3438 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { 3439 ScheduleKind = OMPC_DIST_SCHEDULE_static; 3440 Chunk = CGF.EmitScalarConversion( 3441 RT.getGPUNumThreads(CGF), 3442 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 3443 S.getIterationVariable()->getType(), S.getBeginLoc()); 3444 return; 3445 } 3446 CGOpenMPRuntime::getDefaultDistScheduleAndChunk( 3447 CGF, S, ScheduleKind, Chunk); 3448 } 3449 3450 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk( 3451 CodeGenFunction &CGF, const OMPLoopDirective &S, 3452 OpenMPScheduleClauseKind &ScheduleKind, 3453 const Expr *&ChunkExpr) const { 3454 ScheduleKind = OMPC_SCHEDULE_static; 3455 // Chunk size is 1 in this case. 
3456 llvm::APInt ChunkSize(32, 1); 3457 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, 3458 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 3459 SourceLocation()); 3460 } 3461 3462 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( 3463 CodeGenFunction &CGF, const OMPExecutableDirective &D) const { 3464 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && 3465 " Expected target-based directive."); 3466 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); 3467 for (const CapturedStmt::Capture &C : CS->captures()) { 3468 // Capture variables captured by reference in lambdas for target-based 3469 // directives. 3470 if (!C.capturesVariable()) 3471 continue; 3472 const VarDecl *VD = C.getCapturedVar(); 3473 const auto *RD = VD->getType() 3474 .getCanonicalType() 3475 .getNonReferenceType() 3476 ->getAsCXXRecordDecl(); 3477 if (!RD || !RD->isLambda()) 3478 continue; 3479 Address VDAddr = CGF.GetAddrOfLocalVar(VD); 3480 LValue VDLVal; 3481 if (VD->getType().getCanonicalType()->isReferenceType()) 3482 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); 3483 else 3484 VDLVal = CGF.MakeAddrLValue( 3485 VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); 3486 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures; 3487 FieldDecl *ThisCapture = nullptr; 3488 RD->getCaptureFields(Captures, ThisCapture); 3489 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { 3490 LValue ThisLVal = 3491 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); 3492 llvm::Value *CXXThis = CGF.LoadCXXThis(); 3493 CGF.EmitStoreOfScalar(CXXThis, ThisLVal); 3494 } 3495 for (const LambdaCapture &LC : RD->captures()) { 3496 if (LC.getCaptureKind() != LCK_ByRef) 3497 continue; 3498 const ValueDecl *VD = LC.getCapturedVar(); 3499 // FIXME: For now VD is always a VarDecl because OpenMP does not support 3500 // capturing structured bindings in lambdas yet. 3501 if (!CS->capturesVariable(cast<VarDecl>(VD))) 3502 continue; 3503 auto It = Captures.find(VD); 3504 assert(It != Captures.end() && "Found lambda capture without field."); 3505 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); 3506 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); 3507 if (VD->getType().getCanonicalType()->isReferenceType()) 3508 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, 3509 VD->getType().getCanonicalType()) 3510 .getAddress(CGF); 3511 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal); 3512 } 3513 } 3514 } 3515 3516 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, 3517 LangAS &AS) { 3518 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) 3519 return false; 3520 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 3521 switch(A->getAllocatorType()) { 3522 case OMPAllocateDeclAttr::OMPNullMemAlloc: 3523 case OMPAllocateDeclAttr::OMPDefaultMemAlloc: 3524 // Not supported, fallback to the default mem space. 
3525 case OMPAllocateDeclAttr::OMPThreadMemAlloc: 3526 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 3527 case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 3528 case OMPAllocateDeclAttr::OMPHighBWMemAlloc: 3529 case OMPAllocateDeclAttr::OMPLowLatMemAlloc: 3530 AS = LangAS::Default; 3531 return true; 3532 case OMPAllocateDeclAttr::OMPConstMemAlloc: 3533 AS = LangAS::cuda_constant; 3534 return true; 3535 case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 3536 AS = LangAS::cuda_shared; 3537 return true; 3538 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: 3539 llvm_unreachable("Expected predefined allocator for the variables with the " 3540 "static storage."); 3541 } 3542 return false; 3543 } 3544 3545 // Get current CudaArch and ignore any unknown values 3546 static CudaArch getCudaArch(CodeGenModule &CGM) { 3547 if (!CGM.getTarget().hasFeature("ptx")) 3548 return CudaArch::UNKNOWN; 3549 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { 3550 if (Feature.getValue()) { 3551 CudaArch Arch = StringToCudaArch(Feature.getKey()); 3552 if (Arch != CudaArch::UNKNOWN) 3553 return Arch; 3554 } 3555 } 3556 return CudaArch::UNKNOWN; 3557 } 3558 3559 /// Check to see if target architecture supports unified addressing which is 3560 /// a restriction for OpenMP requires clause "unified_shared_memory". 3561 void CGOpenMPRuntimeGPU::processRequiresDirective( 3562 const OMPRequiresDecl *D) { 3563 for (const OMPClause *Clause : D->clauselists()) { 3564 if (Clause->getClauseKind() == OMPC_unified_shared_memory) { 3565 CudaArch Arch = getCudaArch(CGM); 3566 switch (Arch) { 3567 case CudaArch::SM_20: 3568 case CudaArch::SM_21: 3569 case CudaArch::SM_30: 3570 case CudaArch::SM_32: 3571 case CudaArch::SM_35: 3572 case CudaArch::SM_37: 3573 case CudaArch::SM_50: 3574 case CudaArch::SM_52: 3575 case CudaArch::SM_53: { 3576 SmallString<256> Buffer; 3577 llvm::raw_svector_ostream Out(Buffer); 3578 Out << "Target architecture " << CudaArchToString(Arch) 3579 << " does not support unified addressing"; 3580 CGM.Error(Clause->getBeginLoc(), Out.str()); 3581 return; 3582 } 3583 case CudaArch::SM_60: 3584 case CudaArch::SM_61: 3585 case CudaArch::SM_62: 3586 case CudaArch::SM_70: 3587 case CudaArch::SM_72: 3588 case CudaArch::SM_75: 3589 case CudaArch::SM_80: 3590 case CudaArch::SM_86: 3591 case CudaArch::SM_87: 3592 case CudaArch::SM_89: 3593 case CudaArch::SM_90: 3594 case CudaArch::GFX600: 3595 case CudaArch::GFX601: 3596 case CudaArch::GFX602: 3597 case CudaArch::GFX700: 3598 case CudaArch::GFX701: 3599 case CudaArch::GFX702: 3600 case CudaArch::GFX703: 3601 case CudaArch::GFX704: 3602 case CudaArch::GFX705: 3603 case CudaArch::GFX801: 3604 case CudaArch::GFX802: 3605 case CudaArch::GFX803: 3606 case CudaArch::GFX805: 3607 case CudaArch::GFX810: 3608 case CudaArch::GFX900: 3609 case CudaArch::GFX902: 3610 case CudaArch::GFX904: 3611 case CudaArch::GFX906: 3612 case CudaArch::GFX908: 3613 case CudaArch::GFX909: 3614 case CudaArch::GFX90a: 3615 case CudaArch::GFX90c: 3616 case CudaArch::GFX940: 3617 case CudaArch::GFX941: 3618 case CudaArch::GFX942: 3619 case CudaArch::GFX1010: 3620 case CudaArch::GFX1011: 3621 case CudaArch::GFX1012: 3622 case CudaArch::GFX1013: 3623 case CudaArch::GFX1030: 3624 case CudaArch::GFX1031: 3625 case CudaArch::GFX1032: 3626 case CudaArch::GFX1033: 3627 case CudaArch::GFX1034: 3628 case CudaArch::GFX1035: 3629 case CudaArch::GFX1036: 3630 case CudaArch::GFX1100: 3631 case CudaArch::GFX1101: 3632 case CudaArch::GFX1102: 3633 case CudaArch::GFX1103: 3634 case CudaArch::GFX1150: 3635 
case CudaArch::GFX1151: 3636 case CudaArch::Generic: 3637 case CudaArch::UNUSED: 3638 case CudaArch::UNKNOWN: 3639 break; 3640 case CudaArch::LAST: 3641 llvm_unreachable("Unexpected Cuda arch."); 3642 } 3643 } 3644 } 3645 CGOpenMPRuntime::processRequiresDirective(D); 3646 } 3647 3648 void CGOpenMPRuntimeGPU::clear() { 3649 3650 if (!TeamsReductions.empty()) { 3651 ASTContext &C = CGM.getContext(); 3652 RecordDecl *StaticRD = C.buildImplicitRecord( 3653 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); 3654 StaticRD->startDefinition(); 3655 for (const RecordDecl *TeamReductionRec : TeamsReductions) { 3656 QualType RecTy = C.getRecordType(TeamReductionRec); 3657 auto *Field = FieldDecl::Create( 3658 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, 3659 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), 3660 /*BW=*/nullptr, /*Mutable=*/false, 3661 /*InitStyle=*/ICIS_NoInit); 3662 Field->setAccess(AS_public); 3663 StaticRD->addDecl(Field); 3664 } 3665 StaticRD->completeDefinition(); 3666 QualType StaticTy = C.getRecordType(StaticRD); 3667 llvm::Type *LLVMReductionsBufferTy = 3668 CGM.getTypes().ConvertTypeForMem(StaticTy); 3669 // FIXME: nvlink does not handle weak linkage correctly (object with the 3670 // different size are reported as erroneous). 3671 // Restore CommonLinkage as soon as nvlink is fixed. 3672 auto *GV = new llvm::GlobalVariable( 3673 CGM.getModule(), LLVMReductionsBufferTy, 3674 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, 3675 llvm::Constant::getNullValue(LLVMReductionsBufferTy), 3676 "_openmp_teams_reductions_buffer_$_"); 3677 KernelTeamsReductionPtr->setInitializer( 3678 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, 3679 CGM.VoidPtrTy)); 3680 } 3681 CGOpenMPRuntime::clear(); 3682 } 3683 3684 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { 3685 CGBuilderTy &Bld = CGF.Builder; 3686 llvm::Module *M = &CGF.CGM.getModule(); 3687 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; 3688 llvm::Function *F = M->getFunction(LocSize); 3689 if (!F) { 3690 F = llvm::Function::Create( 3691 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false), 3692 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); 3693 } 3694 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads"); 3695 } 3696 3697 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { 3698 ArrayRef<llvm::Value *> Args{}; 3699 return CGF.EmitRuntimeCall( 3700 OMPBuilder.getOrCreateRuntimeFunction( 3701 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), 3702 Args); 3703 } 3704 3705 llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) { 3706 ArrayRef<llvm::Value *> Args{}; 3707 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 3708 CGM.getModule(), OMPRTL___kmpc_get_warp_size), 3709 Args); 3710 } 3711
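
// For reference, the three helpers above simply emit calls into the device
// runtime. As used here they take no arguments and are treated as returning
// a 32-bit integer; conceptually (a sketch, not the authoritative
// prototypes, which live in the device runtime's interface):
//
//   int32_t __kmpc_get_hardware_num_threads_in_block();
//   int32_t __kmpc_get_hardware_thread_id_in_block();
//   int32_t __kmpc_get_warp_size();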