1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a generalized class for OpenMP runtime code generation 10 // specialized by GPU targets NVPTX and AMDGCN. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGOpenMPRuntimeGPU.h" 15 #include "CodeGenFunction.h" 16 #include "clang/AST/Attr.h" 17 #include "clang/AST/DeclOpenMP.h" 18 #include "clang/AST/OpenMPClause.h" 19 #include "clang/AST/StmtOpenMP.h" 20 #include "clang/AST/StmtVisitor.h" 21 #include "clang/Basic/Cuda.h" 22 #include "llvm/ADT/SmallPtrSet.h" 23 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 24 #include "llvm/Support/MathExtras.h" 25 26 using namespace clang; 27 using namespace CodeGen; 28 using namespace llvm::omp; 29 30 namespace { 31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. 32 class NVPTXActionTy final : public PrePostActionTy { 33 llvm::FunctionCallee EnterCallee = nullptr; 34 ArrayRef<llvm::Value *> EnterArgs; 35 llvm::FunctionCallee ExitCallee = nullptr; 36 ArrayRef<llvm::Value *> ExitArgs; 37 bool Conditional = false; 38 llvm::BasicBlock *ContBlock = nullptr; 39 40 public: 41 NVPTXActionTy(llvm::FunctionCallee EnterCallee, 42 ArrayRef<llvm::Value *> EnterArgs, 43 llvm::FunctionCallee ExitCallee, 44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) 45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), 46 ExitArgs(ExitArgs), Conditional(Conditional) {} 47 void Enter(CodeGenFunction &CGF) override { 48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); 49 if (Conditional) { 50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); 51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); 52 ContBlock = CGF.createBasicBlock("omp_if.end"); 53 // Generate the branch (If-stmt) 54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); 55 CGF.EmitBlock(ThenBlock); 56 } 57 } 58 void Done(CodeGenFunction &CGF) { 59 // Emit the rest of blocks/branches 60 CGF.EmitBranch(ContBlock); 61 CGF.EmitBlock(ContBlock, true); 62 } 63 void Exit(CodeGenFunction &CGF) override { 64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs); 65 } 66 }; 67 68 /// A class to track the execution mode when codegening directives within 69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry 70 /// to the target region and used by containing directives such as 'parallel' 71 /// to emit optimized code. 
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (decl_align)
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      if (BufSize > 1) {
        llvm::APInt ArraySize(32, BufSize);
        Type = C.getConstantArrayType(Type, ArraySize, nullptr,
ArraySizeModifier::Normal, 0); 160 } 161 Field = FieldDecl::Create( 162 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 163 C.getTrivialTypeSourceInfo(Type, SourceLocation()), 164 /*BW=*/nullptr, /*Mutable=*/false, 165 /*InitStyle=*/ICIS_NoInit); 166 Field->setAccess(AS_public); 167 llvm::APInt Align(32, Pair.first.getQuantity()); 168 Field->addAttr(AlignedAttr::CreateImplicit( 169 C, /*IsAlignmentExpr=*/true, 170 IntegerLiteral::Create(C, Align, 171 C.getIntTypeForBitwidth(32, /*Signed=*/0), 172 SourceLocation()), 173 {}, AlignedAttr::GNU_aligned)); 174 } 175 GlobalizedRD->addDecl(Field); 176 MappedDeclsFields.try_emplace(VD, Field); 177 } 178 GlobalizedRD->completeDefinition(); 179 return GlobalizedRD; 180 } 181 182 /// Get the list of variables that can escape their declaration context. 183 class CheckVarsEscapingDeclContext final 184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { 185 CodeGenFunction &CGF; 186 llvm::SetVector<const ValueDecl *> EscapedDecls; 187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; 188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls; 189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; 190 RecordDecl *GlobalizedRD = nullptr; 191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; 192 bool AllEscaped = false; 193 bool IsForCombinedParallelRegion = false; 194 195 void markAsEscaped(const ValueDecl *VD) { 196 // Do not globalize declare target variables. 197 if (!isa<VarDecl>(VD) || 198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) 199 return; 200 VD = cast<ValueDecl>(VD->getCanonicalDecl()); 201 // Use user-specified allocation. 202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) 203 return; 204 // Variables captured by value must be globalized. 205 bool IsCaptured = false; 206 if (auto *CSI = CGF.CapturedStmtInfo) { 207 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { 208 // Check if need to capture the variable that was already captured by 209 // value in the outer region. 210 IsCaptured = true; 211 if (!IsForCombinedParallelRegion) { 212 if (!FD->hasAttrs()) 213 return; 214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); 215 if (!Attr) 216 return; 217 if (((Attr->getCaptureKind() != OMPC_map) && 218 !isOpenMPPrivate(Attr->getCaptureKind())) || 219 ((Attr->getCaptureKind() == OMPC_map) && 220 !FD->getType()->isAnyPointerType())) 221 return; 222 } 223 if (!FD->getType()->isReferenceType()) { 224 assert(!VD->getType()->isVariablyModifiedType() && 225 "Parameter captured by value with variably modified type"); 226 EscapedParameters.insert(VD); 227 } else if (!IsForCombinedParallelRegion) { 228 return; 229 } 230 } 231 } 232 if ((!CGF.CapturedStmtInfo || 233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && 234 VD->getType()->isReferenceType()) 235 // Do not globalize variables with reference type. 236 return; 237 if (VD->getType()->isVariablyModifiedType()) { 238 // If not captured at the target region level then mark the escaped 239 // variable as delayed. 
240 if (IsCaptured) 241 EscapedVariableLengthDecls.insert(VD); 242 else 243 DelayedVariableLengthDecls.insert(VD); 244 } else 245 EscapedDecls.insert(VD); 246 } 247 248 void VisitValueDecl(const ValueDecl *VD) { 249 if (VD->getType()->isLValueReferenceType()) 250 markAsEscaped(VD); 251 if (const auto *VarD = dyn_cast<VarDecl>(VD)) { 252 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { 253 const bool SavedAllEscaped = AllEscaped; 254 AllEscaped = VD->getType()->isLValueReferenceType(); 255 Visit(VarD->getInit()); 256 AllEscaped = SavedAllEscaped; 257 } 258 } 259 } 260 void VisitOpenMPCapturedStmt(const CapturedStmt *S, 261 ArrayRef<OMPClause *> Clauses, 262 bool IsCombinedParallelRegion) { 263 if (!S) 264 return; 265 for (const CapturedStmt::Capture &C : S->captures()) { 266 if (C.capturesVariable() && !C.capturesVariableByCopy()) { 267 const ValueDecl *VD = C.getCapturedVar(); 268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; 269 if (IsCombinedParallelRegion) { 270 // Check if the variable is privatized in the combined construct and 271 // those private copies must be shared in the inner parallel 272 // directive. 273 IsForCombinedParallelRegion = false; 274 for (const OMPClause *C : Clauses) { 275 if (!isOpenMPPrivate(C->getClauseKind()) || 276 C->getClauseKind() == OMPC_reduction || 277 C->getClauseKind() == OMPC_linear || 278 C->getClauseKind() == OMPC_private) 279 continue; 280 ArrayRef<const Expr *> Vars; 281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) 282 Vars = PC->getVarRefs(); 283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) 284 Vars = PC->getVarRefs(); 285 else 286 llvm_unreachable("Unexpected clause."); 287 for (const auto *E : Vars) { 288 const Decl *D = 289 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); 290 if (D == VD->getCanonicalDecl()) { 291 IsForCombinedParallelRegion = true; 292 break; 293 } 294 } 295 if (IsForCombinedParallelRegion) 296 break; 297 } 298 } 299 markAsEscaped(VD); 300 if (isa<OMPCapturedExprDecl>(VD)) 301 VisitValueDecl(VD); 302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; 303 } 304 } 305 } 306 307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) { 308 assert(!GlobalizedRD && 309 "Record for globalized variables is built already."); 310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; 311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; 312 if (IsInTTDRegion) 313 EscapedDeclsForTeams = EscapedDecls.getArrayRef(); 314 else 315 EscapedDeclsForParallel = EscapedDecls.getArrayRef(); 316 GlobalizedRD = ::buildRecordForGlobalizedVars( 317 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, 318 MappedDeclsFields, WarpSize); 319 } 320 321 public: 322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF, 323 ArrayRef<const ValueDecl *> TeamsReductions) 324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) { 325 } 326 virtual ~CheckVarsEscapingDeclContext() = default; 327 void VisitDeclStmt(const DeclStmt *S) { 328 if (!S) 329 return; 330 for (const Decl *D : S->decls()) 331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) 332 VisitValueDecl(VD); 333 } 334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { 335 if (!D) 336 return; 337 if (!D->hasAssociatedStmt()) 338 return; 339 if (const auto *S = 340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { 341 // Do not analyze directives that do not actually require capturing, 342 // like `omp for` or `omp simd` directives. 
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; 344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); 345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { 346 VisitStmt(S->getCapturedStmt()); 347 return; 348 } 349 VisitOpenMPCapturedStmt( 350 S, D->clauses(), 351 CaptureRegions.back() == OMPD_parallel && 352 isOpenMPDistributeDirective(D->getDirectiveKind())); 353 } 354 } 355 void VisitCapturedStmt(const CapturedStmt *S) { 356 if (!S) 357 return; 358 for (const CapturedStmt::Capture &C : S->captures()) { 359 if (C.capturesVariable() && !C.capturesVariableByCopy()) { 360 const ValueDecl *VD = C.getCapturedVar(); 361 markAsEscaped(VD); 362 if (isa<OMPCapturedExprDecl>(VD)) 363 VisitValueDecl(VD); 364 } 365 } 366 } 367 void VisitLambdaExpr(const LambdaExpr *E) { 368 if (!E) 369 return; 370 for (const LambdaCapture &C : E->captures()) { 371 if (C.capturesVariable()) { 372 if (C.getCaptureKind() == LCK_ByRef) { 373 const ValueDecl *VD = C.getCapturedVar(); 374 markAsEscaped(VD); 375 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) 376 VisitValueDecl(VD); 377 } 378 } 379 } 380 } 381 void VisitBlockExpr(const BlockExpr *E) { 382 if (!E) 383 return; 384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { 385 if (C.isByRef()) { 386 const VarDecl *VD = C.getVariable(); 387 markAsEscaped(VD); 388 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) 389 VisitValueDecl(VD); 390 } 391 } 392 } 393 void VisitCallExpr(const CallExpr *E) { 394 if (!E) 395 return; 396 for (const Expr *Arg : E->arguments()) { 397 if (!Arg) 398 continue; 399 if (Arg->isLValue()) { 400 const bool SavedAllEscaped = AllEscaped; 401 AllEscaped = true; 402 Visit(Arg); 403 AllEscaped = SavedAllEscaped; 404 } else { 405 Visit(Arg); 406 } 407 } 408 Visit(E->getCallee()); 409 } 410 void VisitDeclRefExpr(const DeclRefExpr *E) { 411 if (!E) 412 return; 413 const ValueDecl *VD = E->getDecl(); 414 if (AllEscaped) 415 markAsEscaped(VD); 416 if (isa<OMPCapturedExprDecl>(VD)) 417 VisitValueDecl(VD); 418 else if (VD->isInitCapture()) 419 VisitValueDecl(VD); 420 } 421 void VisitUnaryOperator(const UnaryOperator *E) { 422 if (!E) 423 return; 424 if (E->getOpcode() == UO_AddrOf) { 425 const bool SavedAllEscaped = AllEscaped; 426 AllEscaped = true; 427 Visit(E->getSubExpr()); 428 AllEscaped = SavedAllEscaped; 429 } else { 430 Visit(E->getSubExpr()); 431 } 432 } 433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) { 434 if (!E) 435 return; 436 if (E->getCastKind() == CK_ArrayToPointerDecay) { 437 const bool SavedAllEscaped = AllEscaped; 438 AllEscaped = true; 439 Visit(E->getSubExpr()); 440 AllEscaped = SavedAllEscaped; 441 } else { 442 Visit(E->getSubExpr()); 443 } 444 } 445 void VisitExpr(const Expr *E) { 446 if (!E) 447 return; 448 bool SavedAllEscaped = AllEscaped; 449 if (!E->isLValue()) 450 AllEscaped = false; 451 for (const Stmt *Child : E->children()) 452 if (Child) 453 Visit(Child); 454 AllEscaped = SavedAllEscaped; 455 } 456 void VisitStmt(const Stmt *S) { 457 if (!S) 458 return; 459 for (const Stmt *Child : S->children()) 460 if (Child) 461 Visit(Child); 462 } 463 464 /// Returns the record that handles all the escaped local variables and used 465 /// instead of their original storage. 466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { 467 if (!GlobalizedRD) 468 buildRecordForGlobalizedVars(IsInTTDRegion); 469 return GlobalizedRD; 470 } 471 472 /// Returns the field in the globalized record for the escaped variable. 
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { 474 assert(GlobalizedRD && 475 "Record for globalized variables must be generated already."); 476 return MappedDeclsFields.lookup(VD); 477 } 478 479 /// Returns the list of the escaped local variables/parameters. 480 ArrayRef<const ValueDecl *> getEscapedDecls() const { 481 return EscapedDecls.getArrayRef(); 482 } 483 484 /// Checks if the escaped local variable is actually a parameter passed by 485 /// value. 486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { 487 return EscapedParameters; 488 } 489 490 /// Returns the list of the escaped variables with the variably modified 491 /// types. 492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { 493 return EscapedVariableLengthDecls.getArrayRef(); 494 } 495 496 /// Returns the list of the delayed variables with the variably modified 497 /// types. 498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const { 499 return DelayedVariableLengthDecls.getArrayRef(); 500 } 501 }; 502 } // anonymous namespace 503 504 CGOpenMPRuntimeGPU::ExecutionMode 505 CGOpenMPRuntimeGPU::getExecutionMode() const { 506 return CurrentExecutionMode; 507 } 508 509 CGOpenMPRuntimeGPU::DataSharingMode 510 CGOpenMPRuntimeGPU::getDataSharingMode() const { 511 return CurrentDataSharingMode; 512 } 513 514 /// Check for inner (nested) SPMD construct, if any 515 static bool hasNestedSPMDDirective(ASTContext &Ctx, 516 const OMPExecutableDirective &D) { 517 const auto *CS = D.getInnermostCapturedStmt(); 518 const auto *Body = 519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); 520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 521 522 if (const auto *NestedDir = 523 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); 525 switch (D.getDirectiveKind()) { 526 case OMPD_target: 527 if (isOpenMPParallelDirective(DKind)) 528 return true; 529 if (DKind == OMPD_teams) { 530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( 531 /*IgnoreCaptured=*/true); 532 if (!Body) 533 return false; 534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 535 if (const auto *NND = 536 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 537 DKind = NND->getDirectiveKind(); 538 if (isOpenMPParallelDirective(DKind)) 539 return true; 540 } 541 } 542 return false; 543 case OMPD_target_teams: 544 return isOpenMPParallelDirective(DKind); 545 case OMPD_target_simd: 546 case OMPD_target_parallel: 547 case OMPD_target_parallel_for: 548 case OMPD_target_parallel_for_simd: 549 case OMPD_target_teams_distribute: 550 case OMPD_target_teams_distribute_simd: 551 case OMPD_target_teams_distribute_parallel_for: 552 case OMPD_target_teams_distribute_parallel_for_simd: 553 case OMPD_parallel: 554 case OMPD_for: 555 case OMPD_parallel_for: 556 case OMPD_parallel_master: 557 case OMPD_parallel_sections: 558 case OMPD_for_simd: 559 case OMPD_parallel_for_simd: 560 case OMPD_cancel: 561 case OMPD_cancellation_point: 562 case OMPD_ordered: 563 case OMPD_threadprivate: 564 case OMPD_allocate: 565 case OMPD_task: 566 case OMPD_simd: 567 case OMPD_sections: 568 case OMPD_section: 569 case OMPD_single: 570 case OMPD_master: 571 case OMPD_critical: 572 case OMPD_taskyield: 573 case OMPD_barrier: 574 case OMPD_taskwait: 575 case OMPD_taskgroup: 576 case OMPD_atomic: 577 case OMPD_flush: 578 case OMPD_depobj: 579 case OMPD_scan: 580 case OMPD_teams: 581 case 
OMPD_target_data: 582 case OMPD_target_exit_data: 583 case OMPD_target_enter_data: 584 case OMPD_distribute: 585 case OMPD_distribute_simd: 586 case OMPD_distribute_parallel_for: 587 case OMPD_distribute_parallel_for_simd: 588 case OMPD_teams_distribute: 589 case OMPD_teams_distribute_simd: 590 case OMPD_teams_distribute_parallel_for: 591 case OMPD_teams_distribute_parallel_for_simd: 592 case OMPD_target_update: 593 case OMPD_declare_simd: 594 case OMPD_declare_variant: 595 case OMPD_begin_declare_variant: 596 case OMPD_end_declare_variant: 597 case OMPD_declare_target: 598 case OMPD_end_declare_target: 599 case OMPD_declare_reduction: 600 case OMPD_declare_mapper: 601 case OMPD_taskloop: 602 case OMPD_taskloop_simd: 603 case OMPD_master_taskloop: 604 case OMPD_master_taskloop_simd: 605 case OMPD_parallel_master_taskloop: 606 case OMPD_parallel_master_taskloop_simd: 607 case OMPD_requires: 608 case OMPD_unknown: 609 default: 610 llvm_unreachable("Unexpected directive."); 611 } 612 } 613 614 return false; 615 } 616 617 static bool supportsSPMDExecutionMode(ASTContext &Ctx, 618 const OMPExecutableDirective &D) { 619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); 620 switch (DirectiveKind) { 621 case OMPD_target: 622 case OMPD_target_teams: 623 return hasNestedSPMDDirective(Ctx, D); 624 case OMPD_target_parallel_loop: 625 case OMPD_target_parallel: 626 case OMPD_target_parallel_for: 627 case OMPD_target_parallel_for_simd: 628 case OMPD_target_teams_distribute_parallel_for: 629 case OMPD_target_teams_distribute_parallel_for_simd: 630 case OMPD_target_simd: 631 case OMPD_target_teams_distribute_simd: 632 return true; 633 case OMPD_target_teams_distribute: 634 return false; 635 case OMPD_target_teams_loop: 636 // Whether this is true or not depends on how the directive will 637 // eventually be emitted. 
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D)) 639 return TTLD->canBeParallelFor(); 640 return false; 641 case OMPD_parallel: 642 case OMPD_for: 643 case OMPD_parallel_for: 644 case OMPD_parallel_master: 645 case OMPD_parallel_sections: 646 case OMPD_for_simd: 647 case OMPD_parallel_for_simd: 648 case OMPD_cancel: 649 case OMPD_cancellation_point: 650 case OMPD_ordered: 651 case OMPD_threadprivate: 652 case OMPD_allocate: 653 case OMPD_task: 654 case OMPD_simd: 655 case OMPD_sections: 656 case OMPD_section: 657 case OMPD_single: 658 case OMPD_master: 659 case OMPD_critical: 660 case OMPD_taskyield: 661 case OMPD_barrier: 662 case OMPD_taskwait: 663 case OMPD_taskgroup: 664 case OMPD_atomic: 665 case OMPD_flush: 666 case OMPD_depobj: 667 case OMPD_scan: 668 case OMPD_teams: 669 case OMPD_target_data: 670 case OMPD_target_exit_data: 671 case OMPD_target_enter_data: 672 case OMPD_distribute: 673 case OMPD_distribute_simd: 674 case OMPD_distribute_parallel_for: 675 case OMPD_distribute_parallel_for_simd: 676 case OMPD_teams_distribute: 677 case OMPD_teams_distribute_simd: 678 case OMPD_teams_distribute_parallel_for: 679 case OMPD_teams_distribute_parallel_for_simd: 680 case OMPD_target_update: 681 case OMPD_declare_simd: 682 case OMPD_declare_variant: 683 case OMPD_begin_declare_variant: 684 case OMPD_end_declare_variant: 685 case OMPD_declare_target: 686 case OMPD_end_declare_target: 687 case OMPD_declare_reduction: 688 case OMPD_declare_mapper: 689 case OMPD_taskloop: 690 case OMPD_taskloop_simd: 691 case OMPD_master_taskloop: 692 case OMPD_master_taskloop_simd: 693 case OMPD_parallel_master_taskloop: 694 case OMPD_parallel_master_taskloop_simd: 695 case OMPD_requires: 696 case OMPD_unknown: 697 default: 698 break; 699 } 700 llvm_unreachable( 701 "Unknown programming model for OpenMP directive on NVPTX target."); 702 } 703 704 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, 705 StringRef ParentName, 706 llvm::Function *&OutlinedFn, 707 llvm::Constant *&OutlinedFnID, 708 bool IsOffloadEntry, 709 const RegionCodeGenTy &CodeGen) { 710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD); 711 EntryFunctionState EST; 712 WrapperFunctionsMap.clear(); 713 714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 715 assert(!IsBareKernel && "bare kernel should not be at generic mode"); 716 717 // Emit target region as a standalone region. 718 class NVPTXPrePostActionTy : public PrePostActionTy { 719 CGOpenMPRuntimeGPU::EntryFunctionState &EST; 720 const OMPExecutableDirective &D; 721 722 public: 723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, 724 const OMPExecutableDirective &D) 725 : EST(EST), D(D) {} 726 void Enter(CodeGenFunction &CGF) override { 727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false); 729 // Skip target region initialization. 
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); 731 } 732 void Exit(CodeGenFunction &CGF) override { 733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 734 RT.clearLocThreadIdInsertPt(CGF); 735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); 736 } 737 } Action(EST, D); 738 CodeGen.setAction(Action); 739 IsInTTDRegion = true; 740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, 741 IsOffloadEntry, CodeGen); 742 IsInTTDRegion = false; 743 } 744 745 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, 746 CodeGenFunction &CGF, 747 EntryFunctionState &EST, bool IsSPMD) { 748 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1, 749 MaxTeamsVal = -1; 750 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal, 751 MinTeamsVal, MaxTeamsVal); 752 753 CGBuilderTy &Bld = CGF.Builder; 754 Bld.restoreIP(OMPBuilder.createTargetInit( 755 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal)); 756 if (!IsSPMD) 757 emitGenericVarsProlog(CGF, EST.Loc); 758 } 759 760 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, 761 EntryFunctionState &EST, 762 bool IsSPMD) { 763 if (!IsSPMD) 764 emitGenericVarsEpilog(CGF); 765 766 // This is temporary until we remove the fixed sized buffer. 767 ASTContext &C = CGM.getContext(); 768 RecordDecl *StaticRD = C.buildImplicitRecord( 769 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union); 770 StaticRD->startDefinition(); 771 for (const RecordDecl *TeamReductionRec : TeamsReductions) { 772 QualType RecTy = C.getRecordType(TeamReductionRec); 773 auto *Field = FieldDecl::Create( 774 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, 775 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), 776 /*BW=*/nullptr, /*Mutable=*/false, 777 /*InitStyle=*/ICIS_NoInit); 778 Field->setAccess(AS_public); 779 StaticRD->addDecl(Field); 780 } 781 StaticRD->completeDefinition(); 782 QualType StaticTy = C.getRecordType(StaticRD); 783 llvm::Type *LLVMReductionsBufferTy = 784 CGM.getTypes().ConvertTypeForMem(StaticTy); 785 const auto &DL = CGM.getModule().getDataLayout(); 786 uint64_t ReductionDataSize = 787 TeamsReductions.empty() 788 ? 0 789 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue(); 790 CGBuilderTy &Bld = CGF.Builder; 791 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize, 792 C.getLangOpts().OpenMPCUDAReductionBufNum); 793 TeamsReductions.clear(); 794 } 795 796 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, 797 StringRef ParentName, 798 llvm::Function *&OutlinedFn, 799 llvm::Constant *&OutlinedFnID, 800 bool IsOffloadEntry, 801 const RegionCodeGenTy &CodeGen) { 802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); 803 EntryFunctionState EST; 804 805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 806 807 // Emit target region as a standalone region. 
808 class NVPTXPrePostActionTy : public PrePostActionTy { 809 CGOpenMPRuntimeGPU &RT; 810 CGOpenMPRuntimeGPU::EntryFunctionState &EST; 811 bool IsBareKernel; 812 DataSharingMode Mode; 813 const OMPExecutableDirective &D; 814 815 public: 816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, 817 CGOpenMPRuntimeGPU::EntryFunctionState &EST, 818 bool IsBareKernel, const OMPExecutableDirective &D) 819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel), 820 Mode(RT.CurrentDataSharingMode), D(D) {} 821 void Enter(CodeGenFunction &CGF) override { 822 if (IsBareKernel) { 823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; 824 return; 825 } 826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); 827 // Skip target region initialization. 828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); 829 } 830 void Exit(CodeGenFunction &CGF) override { 831 if (IsBareKernel) { 832 RT.CurrentDataSharingMode = Mode; 833 return; 834 } 835 RT.clearLocThreadIdInsertPt(CGF); 836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); 837 } 838 } Action(*this, EST, IsBareKernel, D); 839 CodeGen.setAction(Action); 840 IsInTTDRegion = true; 841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, 842 IsOffloadEntry, CodeGen); 843 IsInTTDRegion = false; 844 } 845 846 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( 847 const OMPExecutableDirective &D, StringRef ParentName, 848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, 849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { 850 if (!IsOffloadEntry) // Nothing to do. 851 return; 852 853 assert(!ParentName.empty() && "Invalid target region parent name!"); 854 855 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); 856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 857 if (Mode || IsBareKernel) 858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, 859 CodeGen); 860 else 861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, 862 CodeGen); 863 } 864 865 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) 866 : CGOpenMPRuntime(CGM) { 867 llvm::OpenMPIRBuilderConfig Config( 868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), 869 CGM.getLangOpts().OpenMPOffloadMandatory, 870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, 871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); 872 OMPBuilder.setConfig(Config); 873 874 if (!CGM.getLangOpts().OpenMPIsTargetDevice) 875 llvm_unreachable("OpenMP can only handle device code."); 876 877 if (CGM.getLangOpts().OpenMPCUDAMode) 878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; 879 880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); 881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty()) 882 return; 883 884 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug, 885 "__omp_rtl_debug_kind"); 886 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription, 887 "__omp_rtl_assume_teams_oversubscription"); 888 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, 889 "__omp_rtl_assume_threads_oversubscription"); 890 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, 891 "__omp_rtl_assume_no_thread_state"); 892 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, 893 "__omp_rtl_assume_no_nested_parallelism"); 894 } 895 896 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, 897 ProcBindKind ProcBind, 898 SourceLocation Loc) { 899 // Nothing to do. 
900 } 901 902 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, 903 llvm::Value *NumThreads, 904 SourceLocation Loc) { 905 // Nothing to do. 906 } 907 908 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, 909 const Expr *NumTeams, 910 const Expr *ThreadLimit, 911 SourceLocation Loc) {} 912 913 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( 914 CodeGenFunction &CGF, const OMPExecutableDirective &D, 915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, 916 const RegionCodeGenTy &CodeGen) { 917 // Emit target region as a standalone region. 918 bool PrevIsInTTDRegion = IsInTTDRegion; 919 IsInTTDRegion = false; 920 auto *OutlinedFun = 921 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( 922 CGF, D, ThreadIDVar, InnermostKind, CodeGen)); 923 IsInTTDRegion = PrevIsInTTDRegion; 924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) { 925 llvm::Function *WrapperFun = 926 createParallelDataSharingWrapper(OutlinedFun, D); 927 WrapperFunctionsMap[OutlinedFun] = WrapperFun; 928 } 929 930 return OutlinedFun; 931 } 932 933 /// Get list of lastprivate variables from the teams distribute ... or 934 /// teams {distribute ...} directives. 935 static void 936 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, 937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) { 938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && 939 "expected teams directive."); 940 const OMPExecutableDirective *Dir = &D; 941 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { 942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( 943 Ctx, 944 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( 945 /*IgnoreCaptured=*/true))) { 946 Dir = dyn_cast_or_null<OMPExecutableDirective>(S); 947 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) 948 Dir = nullptr; 949 } 950 } 951 if (!Dir) 952 return; 953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { 954 for (const Expr *E : C->getVarRefs()) 955 Vars.push_back(getPrivateItem(E)); 956 } 957 } 958 959 /// Get list of reduction variables from the teams ... directives. 960 static void 961 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, 962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) { 963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && 964 "expected teams directive."); 965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { 966 for (const Expr *E : C->privates()) 967 Vars.push_back(getPrivateItem(E)); 968 } 969 } 970 971 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( 972 CodeGenFunction &CGF, const OMPExecutableDirective &D, 973 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, 974 const RegionCodeGenTy &CodeGen) { 975 SourceLocation Loc = D.getBeginLoc(); 976 977 const RecordDecl *GlobalizedRD = nullptr; 978 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; 979 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; 980 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; 981 // Globalize team reductions variable unconditionally in all modes. 
982 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) 983 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); 984 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { 985 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); 986 if (!LastPrivatesReductions.empty()) { 987 GlobalizedRD = ::buildRecordForGlobalizedVars( 988 CGM.getContext(), std::nullopt, LastPrivatesReductions, 989 MappedDeclsFields, WarpSize); 990 } 991 } else if (!LastPrivatesReductions.empty()) { 992 assert(!TeamAndReductions.first && 993 "Previous team declaration is not expected."); 994 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); 995 std::swap(TeamAndReductions.second, LastPrivatesReductions); 996 } 997 998 // Emit target region as a standalone region. 999 class NVPTXPrePostActionTy : public PrePostActionTy { 1000 SourceLocation &Loc; 1001 const RecordDecl *GlobalizedRD; 1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 1003 &MappedDeclsFields; 1004 1005 public: 1006 NVPTXPrePostActionTy( 1007 SourceLocation &Loc, const RecordDecl *GlobalizedRD, 1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 1009 &MappedDeclsFields) 1010 : Loc(Loc), GlobalizedRD(GlobalizedRD), 1011 MappedDeclsFields(MappedDeclsFields) {} 1012 void Enter(CodeGenFunction &CGF) override { 1013 auto &Rt = 1014 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 1015 if (GlobalizedRD) { 1016 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; 1017 I->getSecond().MappedParams = 1018 std::make_unique<CodeGenFunction::OMPMapVars>(); 1019 DeclToAddrMapTy &Data = I->getSecond().LocalVarData; 1020 for (const auto &Pair : MappedDeclsFields) { 1021 assert(Pair.getFirst()->isCanonicalDecl() && 1022 "Expected canonical declaration"); 1023 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData())); 1024 } 1025 } 1026 Rt.emitGenericVarsProlog(CGF, Loc); 1027 } 1028 void Exit(CodeGenFunction &CGF) override { 1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) 1030 .emitGenericVarsEpilog(CGF); 1031 } 1032 } Action(Loc, GlobalizedRD, MappedDeclsFields); 1033 CodeGen.setAction(Action); 1034 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( 1035 CGF, D, ThreadIDVar, InnermostKind, CodeGen); 1036 1037 return OutlinedFun; 1038 } 1039 1040 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, 1041 SourceLocation Loc) { 1042 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1043 return; 1044 1045 CGBuilderTy &Bld = CGF.Builder; 1046 1047 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1048 if (I == FunctionGlobalizedDecls.end()) 1049 return; 1050 1051 for (auto &Rec : I->getSecond().LocalVarData) { 1052 const auto *VD = cast<VarDecl>(Rec.first); 1053 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); 1054 QualType VarTy = VD->getType(); 1055 1056 // Get the local allocation of a firstprivate variable before sharing 1057 llvm::Value *ParValue; 1058 if (EscapedParam) { 1059 LValue ParLVal = 1060 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); 1061 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); 1062 } 1063 1064 // Allocate space for the variable to be globalized 1065 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; 1066 llvm::CallBase *VoidPtr = 1067 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1068 CGM.getModule(), OMPRTL___kmpc_alloc_shared), 1069 AllocArgs, VD->getName()); 1070 // FIXME: We should use the variables 
actual alignment as an argument. 1071 VoidPtr->addRetAttr(llvm::Attribute::get( 1072 CGM.getLLVMContext(), llvm::Attribute::Alignment, 1073 CGM.getContext().getTargetInfo().getNewAlign() / 8)); 1074 1075 // Cast the void pointer and get the address of the globalized variable. 1076 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); 1077 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 1078 VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); 1079 LValue VarAddr = 1080 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy); 1081 Rec.second.PrivateAddr = VarAddr.getAddress(); 1082 Rec.second.GlobalizedVal = VoidPtr; 1083 1084 // Assign the local allocation to the newly globalized location. 1085 if (EscapedParam) { 1086 CGF.EmitStoreOfScalar(ParValue, VarAddr); 1087 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress()); 1088 } 1089 if (auto *DI = CGF.getDebugInfo()) 1090 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation())); 1091 } 1092 1093 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) { 1094 const auto *VD = cast<VarDecl>(ValueD); 1095 std::pair<llvm::Value *, llvm::Value *> AddrSizePair = 1096 getKmpcAllocShared(CGF, VD); 1097 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair); 1098 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(), 1099 CGM.getContext().getDeclAlign(VD), 1100 AlignmentSource::Decl); 1101 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress()); 1102 } 1103 I->getSecond().MappedParams->apply(CGF); 1104 } 1105 1106 bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF, 1107 const VarDecl *VD) const { 1108 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1109 if (I == FunctionGlobalizedDecls.end()) 1110 return false; 1111 1112 // Check variable declaration is delayed: 1113 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD); 1114 } 1115 1116 std::pair<llvm::Value *, llvm::Value *> 1117 CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF, 1118 const VarDecl *VD) { 1119 CGBuilderTy &Bld = CGF.Builder; 1120 1121 // Compute size and alignment. 1122 llvm::Value *Size = CGF.getTypeSize(VD->getType()); 1123 CharUnits Align = CGM.getContext().getDeclAlign(VD); 1124 Size = Bld.CreateNUWAdd( 1125 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); 1126 llvm::Value *AlignVal = 1127 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); 1128 Size = Bld.CreateUDiv(Size, AlignVal); 1129 Size = Bld.CreateNUWMul(Size, AlignVal); 1130 1131 // Allocate space for this VLA object to be globalized. 
1132 llvm::Value *AllocArgs[] = {Size}; 1133 llvm::CallBase *VoidPtr = 1134 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1135 CGM.getModule(), OMPRTL___kmpc_alloc_shared), 1136 AllocArgs, VD->getName()); 1137 VoidPtr->addRetAttr(llvm::Attribute::get( 1138 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity())); 1139 1140 return std::make_pair(VoidPtr, Size); 1141 } 1142 1143 void CGOpenMPRuntimeGPU::getKmpcFreeShared( 1144 CodeGenFunction &CGF, 1145 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) { 1146 // Deallocate the memory for each globalized VLA object 1147 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1148 CGM.getModule(), OMPRTL___kmpc_free_shared), 1149 {AddrSizePair.first, AddrSizePair.second}); 1150 } 1151 1152 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { 1153 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1154 return; 1155 1156 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1157 if (I != FunctionGlobalizedDecls.end()) { 1158 // Deallocate the memory for each globalized VLA object that was 1159 // globalized in the prolog (i.e. emitGenericVarsProlog). 1160 for (const auto &AddrSizePair : 1161 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { 1162 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1163 CGM.getModule(), OMPRTL___kmpc_free_shared), 1164 {AddrSizePair.first, AddrSizePair.second}); 1165 } 1166 // Deallocate the memory for each globalized value 1167 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { 1168 const auto *VD = cast<VarDecl>(Rec.first); 1169 I->getSecond().MappedParams->restore(CGF); 1170 1171 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, 1172 CGF.getTypeSize(VD->getType())}; 1173 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1174 CGM.getModule(), OMPRTL___kmpc_free_shared), 1175 FreeArgs); 1176 } 1177 } 1178 } 1179 1180 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, 1181 const OMPExecutableDirective &D, 1182 SourceLocation Loc, 1183 llvm::Function *OutlinedFn, 1184 ArrayRef<llvm::Value *> CapturedVars) { 1185 if (!CGF.HaveInsertPoint()) 1186 return; 1187 1188 bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 1189 1190 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 1191 /*Name=*/".zero.addr"); 1192 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); 1193 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; 1194 // We don't emit any thread id function call in bare kernel, but because the 1195 // outlined function has a pointer argument, we emit a nullptr here. 
  if (IsBareKernel)
    OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
  else
    OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}

void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
1291 llvm::Value *Args[] = { 1292 llvm::ConstantPointerNull::get( 1293 cast<llvm::PointerType>(getIdentTyPointerTy())), 1294 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; 1295 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1296 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd), 1297 Args); 1298 } 1299 1300 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, 1301 SourceLocation Loc, 1302 OpenMPDirectiveKind Kind, bool, 1303 bool) { 1304 // Always emit simple barriers! 1305 if (!CGF.HaveInsertPoint()) 1306 return; 1307 // Build call __kmpc_cancel_barrier(loc, thread_id); 1308 unsigned Flags = getDefaultFlagsForBarriers(Kind); 1309 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), 1310 getThreadID(CGF, Loc)}; 1311 1312 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1313 CGM.getModule(), OMPRTL___kmpc_barrier), 1314 Args); 1315 } 1316 1317 void CGOpenMPRuntimeGPU::emitCriticalRegion( 1318 CodeGenFunction &CGF, StringRef CriticalName, 1319 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, 1320 const Expr *Hint) { 1321 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); 1322 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); 1323 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); 1324 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); 1325 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); 1326 1327 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 1328 1329 // Get the mask of active threads in the warp. 1330 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1331 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask)); 1332 // Fetch team-local id of the thread. 1333 llvm::Value *ThreadID = RT.getGPUThreadID(CGF); 1334 1335 // Get the width of the team. 1336 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); 1337 1338 // Initialize the counter variable for the loop. 1339 QualType Int32Ty = 1340 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); 1341 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); 1342 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); 1343 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, 1344 /*isInit=*/true); 1345 1346 // Block checks if loop counter exceeds upper bound. 1347 CGF.EmitBlock(LoopBB); 1348 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 1349 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); 1350 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); 1351 1352 // Block tests which single thread should execute region, and which threads 1353 // should go straight to synchronisation point. 1354 CGF.EmitBlock(TestBB); 1355 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 1356 llvm::Value *CmpThreadToCounter = 1357 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); 1358 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); 1359 1360 // Block emits the body of the critical region. 1361 CGF.EmitBlock(BodyBB); 1362 1363 // Output the critical statement. 1364 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, 1365 Hint); 1366 1367 // After the body surrounded by the critical region, the single executing 1368 // thread will jump to the synchronisation point. 1369 // Block waits for all threads in current team to finish then increments the 1370 // counter variable and returns to the loop. 
1371 CGF.EmitBlock(SyncBB); 1372 // Reconverge active threads in the warp. 1373 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1374 CGM.getModule(), OMPRTL___kmpc_syncwarp), 1375 Mask); 1376 1377 llvm::Value *IncCounterVal = 1378 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); 1379 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal); 1380 CGF.EmitBranch(LoopBB); 1381 1382 // Block that is reached when all threads in the team complete the region. 1383 CGF.EmitBlock(ExitBB, /*IsFinished=*/true); 1384 } 1385 1386 /// Cast value to the specified type. 1387 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, 1388 QualType ValTy, QualType CastTy, 1389 SourceLocation Loc) { 1390 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && 1391 "Cast type must sized."); 1392 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && 1393 "Val type must sized."); 1394 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy); 1395 if (ValTy == CastTy) 1396 return Val; 1397 if (CGF.getContext().getTypeSizeInChars(ValTy) == 1398 CGF.getContext().getTypeSizeInChars(CastTy)) 1399 return CGF.Builder.CreateBitCast(Val, LLVMCastTy); 1400 if (CastTy->isIntegerType() && ValTy->isIntegerType()) 1401 return CGF.Builder.CreateIntCast(Val, LLVMCastTy, 1402 CastTy->hasSignedIntegerRepresentation()); 1403 Address CastItem = CGF.CreateMemTemp(CastTy); 1404 Address ValCastItem = CastItem.withElementType(Val->getType()); 1405 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, 1406 LValueBaseInfo(AlignmentSource::Type), 1407 TBAAAccessInfo()); 1408 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, 1409 LValueBaseInfo(AlignmentSource::Type), 1410 TBAAAccessInfo()); 1411 } 1412 1413 /// 1414 /// Design of OpenMP reductions on the GPU 1415 /// 1416 /// Consider a typical OpenMP program with one or more reduction 1417 /// clauses: 1418 /// 1419 /// float foo; 1420 /// double bar; 1421 /// #pragma omp target teams distribute parallel for \ 1422 /// reduction(+:foo) reduction(*:bar) 1423 /// for (int i = 0; i < N; i++) { 1424 /// foo += A[i]; bar *= B[i]; 1425 /// } 1426 /// 1427 /// where 'foo' and 'bar' are reduced across all OpenMP threads in 1428 /// all teams. In our OpenMP implementation on the NVPTX device an 1429 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 1430 /// within a team are mapped to CUDA threads within a threadblock. 1431 /// Our goal is to efficiently aggregate values across all OpenMP 1432 /// threads such that: 1433 /// 1434 /// - the compiler and runtime are logically concise, and 1435 /// - the reduction is performed efficiently in a hierarchical 1436 /// manner as follows: within OpenMP threads in the same warp, 1437 /// across warps in a threadblock, and finally across teams on 1438 /// the NVPTX device. 1439 /// 1440 /// Introduction to Decoupling 1441 /// 1442 /// We would like to decouple the compiler and the runtime so that the 1443 /// latter is ignorant of the reduction variables (number, data types) 1444 /// and the reduction operators. This allows a simpler interface 1445 /// and implementation while still attaining good performance. 1446 /// 1447 /// Pseudocode for the aforementioned OpenMP program generated by the 1448 /// compiler is as follows: 1449 /// 1450 /// 1. Create private copies of reduction variables on each OpenMP 1451 /// thread: 'foo_private', 'bar_private' 1452 /// 2. 
Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 1453 /// to it and writes the result in 'foo_private' and 'bar_private' 1454 /// respectively. 1455 /// 3. Call the OpenMP runtime on the GPU to reduce within a team 1456 /// and store the result on the team master: 1457 /// 1458 /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 1459 /// reduceData, shuffleReduceFn, interWarpCpyFn) 1460 /// 1461 /// where: 1462 /// struct ReduceData { 1463 /// double *foo; 1464 /// double *bar; 1465 /// } reduceData 1466 /// reduceData.foo = &foo_private 1467 /// reduceData.bar = &bar_private 1468 /// 1469 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 1470 /// auxiliary functions generated by the compiler that operate on 1471 /// variables of type 'ReduceData'. They aid the runtime perform 1472 /// algorithmic steps in a data agnostic manner. 1473 /// 1474 /// 'shuffleReduceFn' is a pointer to a function that reduces data 1475 /// of type 'ReduceData' across two OpenMP threads (lanes) in the 1476 /// same warp. It takes the following arguments as input: 1477 /// 1478 /// a. variable of type 'ReduceData' on the calling lane, 1479 /// b. its lane_id, 1480 /// c. an offset relative to the current lane_id to generate a 1481 /// remote_lane_id. The remote lane contains the second 1482 /// variable of type 'ReduceData' that is to be reduced. 1483 /// d. an algorithm version parameter determining which reduction 1484 /// algorithm to use. 1485 /// 1486 /// 'shuffleReduceFn' retrieves data from the remote lane using 1487 /// efficient GPU shuffle intrinsics and reduces, using the 1488 /// algorithm specified by the 4th parameter, the two operands 1489 /// element-wise. The result is written to the first operand. 1490 /// 1491 /// Different reduction algorithms are implemented in different 1492 /// runtime functions, all calling 'shuffleReduceFn' to perform 1493 /// the essential reduction step. Therefore, based on the 4th 1494 /// parameter, this function behaves slightly differently to 1495 /// cooperate with the runtime to ensure correctness under 1496 /// different circumstances. 1497 /// 1498 /// 'InterWarpCpyFn' is a pointer to a function that transfers 1499 /// reduced variables across warps. It tunnels, through CUDA 1500 /// shared memory, the thread-private data of type 'ReduceData' 1501 /// from lane 0 of each warp to a lane in the first warp. 1502 /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 1503 /// The last team writes the global reduced value to memory. 1504 /// 1505 /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 1506 /// reduceData, shuffleReduceFn, interWarpCpyFn, 1507 /// scratchpadCopyFn, loadAndReduceFn) 1508 /// 1509 /// 'scratchpadCopyFn' is a helper that stores reduced 1510 /// data from the team master to a scratchpad array in 1511 /// global memory. 1512 /// 1513 /// 'loadAndReduceFn' is a helper that loads data from 1514 /// the scratchpad array and reduces it with the input 1515 /// operand. 1516 /// 1517 /// These compiler generated functions hide address 1518 /// calculation and alignment information from the runtime. 1519 /// 5. if ret == 1: 1520 /// The team master of the last team stores the reduced 1521 /// result to the globals in memory. 
1522 ///     foo += reduceData.foo; bar *= reduceData.bar
1523 ///
1524 ///
1525 /// Warp Reduction Algorithms
1526 ///
1527 /// On the warp level, we have three algorithms implemented in the
1528 /// OpenMP runtime depending on the number of active lanes:
1529 ///
1530 /// Full Warp Reduction
1531 ///
1532 /// The reduce algorithm within a warp where all lanes are active
1533 /// is implemented in the runtime as follows:
1534 ///
1535 /// full_warp_reduce(void *reduce_data,
1536 ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537 ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538 ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
1539 /// }
1540 ///
1541 /// The algorithm completes in log(2, WARPSIZE) steps.
1542 ///
1543 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1544 /// not used; we therefore save instructions by not retrieving lane_id
1545 /// from the corresponding special registers. The 4th parameter, which
1546 /// represents the version of the algorithm being used, is set to 0 to
1547 /// signify full warp reduction.
1548 ///
1549 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1550 ///
1551 /// #reduce_elem refers to an element in the local lane's data structure
1552 /// #remote_elem is retrieved from a remote lane
1553 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1555 ///
1556 /// Contiguous Partial Warp Reduction
1557 ///
1558 /// This reduce algorithm is used within a warp where only the first
1559 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1560 /// number of OpenMP threads in a parallel region is not a multiple of
1561 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1562 ///
1563 /// void
1564 /// contiguous_partial_reduce(void *reduce_data,
1565 ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566 ///                           int size, int lane_id) {
1567 ///   int curr_size;
1568 ///   int offset;
1569 ///   curr_size = size;
1570 ///   offset = curr_size/2;
1571 ///   while (offset>0) {
1572 ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573 ///     curr_size = (curr_size+1)/2;
1574 ///     offset = curr_size/2;
1575 ///   }
1576 /// }
1577 ///
1578 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1579 ///
1580 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581 /// if (lane_id < offset)
1582 ///   reduce_elem = reduce_elem REDUCE_OP remote_elem
1583 /// else
1584 ///   reduce_elem = remote_elem
1585 ///
1586 /// This algorithm assumes that the data to be reduced are located in a
1587 /// contiguous subset of lanes starting from the first. When there is
1588 /// an odd number of active lanes, the data in the last lane is not
1589 /// aggregated with any other lane's data but is instead copied over.
1590 ///
1591 /// Dispersed Partial Warp Reduction
1592 ///
1593 /// This algorithm is used within a warp when any discontiguous subset of
1594 /// lanes are active. It is used to implement the reduction operation
1595 /// across lanes in an OpenMP simd region or in a nested parallel region.
1596 ///
1597 /// void
1598 /// dispersed_partial_reduce(void *reduce_data,
1599 ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600 ///   int size, remote_id;
1601 ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602 ///   do {
1603 ///     remote_id = next_active_lane_id_right_after_me();
1604 ///     # the above function returns 0 if no active lane
1605 ///     # is present right after the current lane.
1606 ///     size = number_of_active_lanes_in_this_warp();
1607 ///     logical_lane_id /= 2;
1608 ///     ShuffleReduceFn(reduce_data, logical_lane_id,
1609 ///                     remote_id-1-threadIdx.x, 2);
1610 ///   } while (logical_lane_id % 2 == 0 && size > 1);
1611 /// }
1612 ///
1613 /// There is no assumption made about the initial state of the reduction.
1614 /// Any number of lanes (>=1) could be active at any position. The reduction
1615 /// result is returned in the first active lane.
1616 ///
1617 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1618 ///
1619 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620 /// if (lane_id % 2 == 0 && offset > 0)
1621 ///   reduce_elem = reduce_elem REDUCE_OP remote_elem
1622 /// else
1623 ///   reduce_elem = remote_elem
1624 ///
1625 ///
1626 /// Intra-Team Reduction
1627 ///
1628 /// This function, as implemented in the runtime call
1629 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630 /// threads in a team. It first reduces within a warp using the
1631 /// aforementioned algorithms. We then proceed to gather all such
1632 /// reduced values at the first warp.
1633 ///
1634 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635 /// data from each "warp master" (the zeroth lane of each warp, where
1636 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1637 /// a mathematical sense) the problem of reduction across warp masters in
1638 /// a block to the problem of warp reduction.
1639 ///
1640 ///
1641 /// Inter-Team Reduction
1642 ///
1643 /// Once a team has reduced its data to a single value, it is stored in
1644 /// a global scratchpad array. Since each team has a distinct slot, this
1645 /// can be done without locking.
1646 ///
1647 /// The last team to write to the scratchpad array proceeds to reduce the
1648 /// scratchpad array. One or more workers in the last team use the helper
1649 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
1650 /// the k'th worker reduces every k'th element.
1651 ///
1652 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1653 /// reduce across workers and compute a globally reduced value.
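///
/// Putting It Together (illustrative sketch)
///
/// The pseudocode below only illustrates how the steps above compose for
/// the running example; the argument lists are abbreviated with '...' as
/// in the steps above and are not the exact runtime signatures, and the
/// loop bounds are placeholders for this thread's chunk:
///
///   // 1-2. privatize and reduce the local chunk
///   float foo_private = 0; double bar_private = 1;
///   for (i in this thread's chunk) { foo_private += A[i]; bar_private *= B[i]; }
///   // 3. describe the private copies to the runtime via 'ReduceData'
///   reduceData.foo = &foo_private; reduceData.bar = &bar_private;
///   __kmpc_nvptx_parallel_reduce_nowait_v2(..., reduceData,
///                                          shuffleReduceFn, interWarpCpyFn);
///   // 4. reduce across teams; the last team observes ret == 1
///   ret = __kmpc_nvptx_teams_reduce_nowait(..., reduceData, shuffleReduceFn,
///                                          interWarpCpyFn, scratchpadCopyFn,
///                                          loadAndReduceFn);
///   // 5. the last team's master commits the result to the original variables
///   if (ret == 1) {
///     foo += reduceData.foo; bar *= reduceData.bar;
///   }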
1654 /// 1655 void CGOpenMPRuntimeGPU::emitReduction( 1656 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, 1657 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, 1658 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { 1659 if (!CGF.HaveInsertPoint()) 1660 return; 1661 1662 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); 1663 bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind); 1664 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); 1665 1666 ASTContext &C = CGM.getContext(); 1667 1668 if (Options.SimpleReduction) { 1669 assert(!TeamsReduction && !ParallelReduction && 1670 "Invalid reduction selection in emitReduction."); 1671 (void)ParallelReduction; 1672 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, 1673 ReductionOps, Options); 1674 return; 1675 } 1676 1677 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; 1678 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); 1679 int Cnt = 0; 1680 for (const Expr *DRE : Privates) { 1681 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); 1682 ++Cnt; 1683 } 1684 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( 1685 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1); 1686 1687 if (TeamsReduction) 1688 TeamsReductions.push_back(ReductionRec); 1689 1690 // Source location for the ident struct 1691 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 1692 1693 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; 1694 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(), 1695 CGF.AllocaInsertPt->getIterator()); 1696 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(), 1697 CGF.Builder.GetInsertPoint()); 1698 llvm::OpenMPIRBuilder::LocationDescription OmpLoc( 1699 CodeGenIP, CGF.SourceLocToDebugLoc(Loc)); 1700 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos; 1701 1702 CodeGenFunction::OMPPrivateScope Scope(CGF); 1703 unsigned Idx = 0; 1704 for (const Expr *Private : Privates) { 1705 llvm::Type *ElementType; 1706 llvm::Value *Variable; 1707 llvm::Value *PrivateVariable; 1708 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr; 1709 ElementType = CGF.ConvertTypeForMem(Private->getType()); 1710 const auto *RHSVar = 1711 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl()); 1712 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF); 1713 const auto *LHSVar = 1714 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl()); 1715 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF); 1716 llvm::OpenMPIRBuilder::EvalKind EvalKind; 1717 switch (CGF.getEvaluationKind(Private->getType())) { 1718 case TEK_Scalar: 1719 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar; 1720 break; 1721 case TEK_Complex: 1722 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex; 1723 break; 1724 case TEK_Aggregate: 1725 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate; 1726 break; 1727 } 1728 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I, 1729 llvm::Value **LHSPtr, llvm::Value **RHSPtr, 1730 llvm::Function *NewFunc) { 1731 CGF.Builder.restoreIP(CodeGenIP); 1732 auto *CurFn = CGF.CurFn; 1733 CGF.CurFn = NewFunc; 1734 1735 *LHSPtr = CGF.GetAddrOfLocalVar( 1736 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl())) 1737 .emitRawPointer(CGF); 1738 *RHSPtr = CGF.GetAddrOfLocalVar( 1739 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl())) 1740 
.emitRawPointer(CGF); 1741 1742 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I], 1743 cast<DeclRefExpr>(LHSExprs[I]), 1744 cast<DeclRefExpr>(RHSExprs[I])); 1745 1746 CGF.CurFn = CurFn; 1747 1748 return InsertPointTy(CGF.Builder.GetInsertBlock(), 1749 CGF.Builder.GetInsertPoint()); 1750 }; 1751 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( 1752 ElementType, Variable, PrivateVariable, EvalKind, 1753 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); 1754 Idx++; 1755 } 1756 1757 CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU( 1758 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, 1759 DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, 1760 CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum, 1761 RTLoc)); 1762 return; 1763 } 1764 1765 const VarDecl * 1766 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, 1767 const VarDecl *NativeParam) const { 1768 if (!NativeParam->getType()->isReferenceType()) 1769 return NativeParam; 1770 QualType ArgType = NativeParam->getType(); 1771 QualifierCollector QC; 1772 const Type *NonQualTy = QC.strip(ArgType); 1773 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1774 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { 1775 if (Attr->getCaptureKind() == OMPC_map) { 1776 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, 1777 LangAS::opencl_global); 1778 } 1779 } 1780 ArgType = CGM.getContext().getPointerType(PointeeTy); 1781 QC.addRestrict(); 1782 enum { NVPTX_local_addr = 5 }; 1783 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); 1784 ArgType = QC.apply(CGM.getContext(), ArgType); 1785 if (isa<ImplicitParamDecl>(NativeParam)) 1786 return ImplicitParamDecl::Create( 1787 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), 1788 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); 1789 return ParmVarDecl::Create( 1790 CGM.getContext(), 1791 const_cast<DeclContext *>(NativeParam->getDeclContext()), 1792 NativeParam->getBeginLoc(), NativeParam->getLocation(), 1793 NativeParam->getIdentifier(), ArgType, 1794 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); 1795 } 1796 1797 Address 1798 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, 1799 const VarDecl *NativeParam, 1800 const VarDecl *TargetParam) const { 1801 assert(NativeParam != TargetParam && 1802 NativeParam->getType()->isReferenceType() && 1803 "Native arg must not be the same as target arg."); 1804 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); 1805 QualType NativeParamType = NativeParam->getType(); 1806 QualifierCollector QC; 1807 const Type *NonQualTy = QC.strip(NativeParamType); 1808 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1809 unsigned NativePointeeAddrSpace = 1810 CGF.getTypes().getTargetAddressSpace(NativePointeeTy); 1811 QualType TargetTy = TargetParam->getType(); 1812 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, 1813 TargetTy, SourceLocation()); 1814 // Cast to native address space. 
1815   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1816       TargetAddr,
1817       llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
1818   Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1819   CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1820                         NativeParamType);
1821   return NativeParamAddr;
1822 }
1823
1824 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1825     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1826     ArrayRef<llvm::Value *> Args) const {
1827   SmallVector<llvm::Value *, 4> TargetArgs;
1828   TargetArgs.reserve(Args.size());
1829   auto *FnType = OutlinedFn.getFunctionType();
1830   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1831     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1832       TargetArgs.append(std::next(Args.begin(), I), Args.end());
1833       break;
1834     }
1835     llvm::Type *TargetType = FnType->getParamType(I);
1836     llvm::Value *NativeArg = Args[I];
1837     if (!TargetType->isPointerTy()) {
1838       TargetArgs.emplace_back(NativeArg);
1839       continue;
1840     }
1841     TargetArgs.emplace_back(
1842         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1843   }
1844   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1845 }
1846
1847 /// Emit function which wraps the outlined parallel region
1848 /// and controls the arguments which are passed to this function.
1849 /// The wrapper ensures that the outlined function is called
1850 /// with the correct arguments when data is shared.
1851 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1852     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1853   ASTContext &Ctx = CGM.getContext();
1854   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1855
1856   // Create a function that takes as argument the source thread.
1857   FunctionArgList WrapperArgs;
1858   QualType Int16QTy =
1859       Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1860   QualType Int32QTy =
1861       Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1862   ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1863                                      /*Id=*/nullptr, Int16QTy,
1864                                      ImplicitParamKind::Other);
1865   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1866                                /*Id=*/nullptr, Int32QTy,
1867                                ImplicitParamKind::Other);
1868   WrapperArgs.emplace_back(&ParallelLevelArg);
1869   WrapperArgs.emplace_back(&WrapperArg);
1870
1871   const CGFunctionInfo &CGFI =
1872       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
1873
1874   auto *Fn = llvm::Function::Create(
1875       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1876       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1877
1878   // Ensure we do not inline the function. This is trivially true for the ones
1879   // passed to __kmpc_fork_call but the ones called in serialized regions
1880   // could be inlined. This is not perfect, but it is closer to the invariant
1881   // we want, namely, every data environment starts with a new function.
1882   // TODO: We should pass the if condition to the runtime function and do the
1883   // handling there. Much cleaner code.
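  // As an illustrative sketch only (the wrapper name, parameter names, and
  // argument unpacking below are simplifications rather than literal output),
  // the function built by the remainder of this routine behaves roughly like:
  //
  //   void <outlined>_wrapper(uint16_t ParallelLevel, uint32_t ThreadId) {
  //     uint32_t Zero = 0;
  //     void **SharedArgs;
  //     __kmpc_get_shared_variables(&SharedArgs);
  //     // Loop bounds and captured variables are unpacked from SharedArgs.
  //     <outlined>(&ThreadId, &Zero, SharedArgs[0], ..., SharedArgs[N-1]);
  //   }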
1884 Fn->addFnAttr(llvm::Attribute::NoInline); 1885 1886 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 1887 Fn->setLinkage(llvm::GlobalValue::InternalLinkage); 1888 Fn->setDoesNotRecurse(); 1889 1890 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); 1891 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, 1892 D.getBeginLoc(), D.getBeginLoc()); 1893 1894 const auto *RD = CS.getCapturedRecordDecl(); 1895 auto CurField = RD->field_begin(); 1896 1897 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 1898 /*Name=*/".zero.addr"); 1899 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); 1900 // Get the array of arguments. 1901 SmallVector<llvm::Value *, 8> Args; 1902 1903 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF)); 1904 Args.emplace_back(ZeroAddr.emitRawPointer(CGF)); 1905 1906 CGBuilderTy &Bld = CGF.Builder; 1907 auto CI = CS.capture_begin(); 1908 1909 // Use global memory for data sharing. 1910 // Handle passing of global args to workers. 1911 RawAddress GlobalArgs = 1912 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); 1913 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); 1914 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; 1915 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1916 CGM.getModule(), OMPRTL___kmpc_get_shared_variables), 1917 DataSharingArgs); 1918 1919 // Retrieve the shared variables from the list of references returned 1920 // by the runtime. Pass the variables to the outlined function. 1921 Address SharedArgListAddress = Address::invalid(); 1922 if (CS.capture_size() > 0 || 1923 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 1924 SharedArgListAddress = CGF.EmitLoadOfPointer( 1925 GlobalArgs, CGF.getContext() 1926 .getPointerType(CGF.getContext().VoidPtrTy) 1927 .castAs<PointerType>()); 1928 } 1929 unsigned Idx = 0; 1930 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 1931 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 1932 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1933 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); 1934 llvm::Value *LB = CGF.EmitLoadOfScalar( 1935 TypedAddress, 1936 /*Volatile=*/false, 1937 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 1938 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); 1939 Args.emplace_back(LB); 1940 ++Idx; 1941 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 1942 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1943 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy); 1944 llvm::Value *UB = CGF.EmitLoadOfScalar( 1945 TypedAddress, 1946 /*Volatile=*/false, 1947 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 1948 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); 1949 Args.emplace_back(UB); 1950 ++Idx; 1951 } 1952 if (CS.capture_size() > 0) { 1953 ASTContext &CGFContext = CGF.getContext(); 1954 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { 1955 QualType ElemTy = CurField->getType(); 1956 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); 1957 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1958 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)), 1959 CGF.ConvertTypeForMem(ElemTy)); 1960 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, 1961 /*Volatile=*/false, 1962 CGFContext.getPointerType(ElemTy), 1963 CI->getLocation()); 1964 if (CI->capturesVariableByCopy() && 1965 
!CI->getCapturedVar()->getType()->isAnyPointerType()) { 1966 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), 1967 CI->getLocation()); 1968 } 1969 Args.emplace_back(Arg); 1970 } 1971 } 1972 1973 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); 1974 CGF.FinishFunction(); 1975 return Fn; 1976 } 1977 1978 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, 1979 const Decl *D) { 1980 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1981 return; 1982 1983 assert(D && "Expected function or captured|block decl."); 1984 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && 1985 "Function is registered already."); 1986 assert((!TeamAndReductions.first || TeamAndReductions.first == D) && 1987 "Team is set but not processed."); 1988 const Stmt *Body = nullptr; 1989 bool NeedToDelayGlobalization = false; 1990 if (const auto *FD = dyn_cast<FunctionDecl>(D)) { 1991 Body = FD->getBody(); 1992 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { 1993 Body = BD->getBody(); 1994 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { 1995 Body = CD->getBody(); 1996 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; 1997 if (NeedToDelayGlobalization && 1998 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) 1999 return; 2000 } 2001 if (!Body) 2002 return; 2003 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); 2004 VarChecker.Visit(Body); 2005 const RecordDecl *GlobalizedVarsRecord = 2006 VarChecker.getGlobalizedRecord(IsInTTDRegion); 2007 TeamAndReductions.first = nullptr; 2008 TeamAndReductions.second.clear(); 2009 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = 2010 VarChecker.getEscapedVariableLengthDecls(); 2011 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls = 2012 VarChecker.getDelayedVariableLengthDecls(); 2013 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() && 2014 DelayedVariableLengthDecls.empty()) 2015 return; 2016 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; 2017 I->getSecond().MappedParams = 2018 std::make_unique<CodeGenFunction::OMPMapVars>(); 2019 I->getSecond().EscapedParameters.insert( 2020 VarChecker.getEscapedParameters().begin(), 2021 VarChecker.getEscapedParameters().end()); 2022 I->getSecond().EscapedVariableLengthDecls.append( 2023 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); 2024 I->getSecond().DelayedVariableLengthDecls.append( 2025 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end()); 2026 DeclToAddrMapTy &Data = I->getSecond().LocalVarData; 2027 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { 2028 assert(VD->isCanonicalDecl() && "Expected canonical declaration"); 2029 Data.insert(std::make_pair(VD, MappedVarData())); 2030 } 2031 if (!NeedToDelayGlobalization) { 2032 emitGenericVarsProlog(CGF, D->getBeginLoc()); 2033 struct GlobalizationScope final : EHScopeStack::Cleanup { 2034 GlobalizationScope() = default; 2035 2036 void Emit(CodeGenFunction &CGF, Flags flags) override { 2037 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) 2038 .emitGenericVarsEpilog(CGF); 2039 } 2040 }; 2041 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); 2042 } 2043 } 2044 2045 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, 2046 const VarDecl *VD) { 2047 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { 2048 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2049 auto AS = LangAS::Default; 2050 switch (A->getAllocatorType()) { 2051 // 
Use the default allocator here as by default local vars are
2052     // threadlocal.
2053     case OMPAllocateDeclAttr::OMPNullMemAlloc:
2054     case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2055     case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2056     case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2057     case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2058       // Follow the user decision - use default allocation.
2059       return Address::invalid();
2060     case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2061       // TODO: implement support for user-defined allocators.
2062       return Address::invalid();
2063     case OMPAllocateDeclAttr::OMPConstMemAlloc:
2064       AS = LangAS::cuda_constant;
2065       break;
2066     case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2067       AS = LangAS::cuda_shared;
2068       break;
2069     case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2070     case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2071       break;
2072     }
2073     llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2074     auto *GV = new llvm::GlobalVariable(
2075         CGM.getModule(), VarTy, /*isConstant=*/false,
2076         llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2077         VD->getName(),
2078         /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2079         CGM.getContext().getTargetAddressSpace(AS));
2080     CharUnits Align = CGM.getContext().getDeclAlign(VD);
2081     GV->setAlignment(Align.getAsAlign());
2082     return Address(
2083         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2084             GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
2085                     VD->getType().getAddressSpace()))),
2086         VarTy, Align);
2087   }
2088
2089   if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2090     return Address::invalid();
2091
2092   VD = VD->getCanonicalDecl();
2093   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2094   if (I == FunctionGlobalizedDecls.end())
2095     return Address::invalid();
2096   auto VDI = I->getSecond().LocalVarData.find(VD);
2097   if (VDI != I->getSecond().LocalVarData.end())
2098     return VDI->second.PrivateAddr;
2099   if (VD->hasAttrs()) {
2100     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2101          E(VD->attr_end());
2102          IT != E; ++IT) {
2103       auto VDI = I->getSecond().LocalVarData.find(
2104           cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2105               ->getCanonicalDecl());
2106       if (VDI != I->getSecond().LocalVarData.end())
2107         return VDI->second.PrivateAddr;
2108     }
2109   }
2110
2111   return Address::invalid();
2112 }
2113
2114 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2115   FunctionGlobalizedDecls.erase(CGF.CurFn);
2116   CGOpenMPRuntime::functionFinished(CGF);
2117 }
2118
2119 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2120     CodeGenFunction &CGF, const OMPLoopDirective &S,
2121     OpenMPDistScheduleClauseKind &ScheduleKind,
2122     llvm::Value *&Chunk) const {
2123   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2124   if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2125     ScheduleKind = OMPC_DIST_SCHEDULE_static;
2126     Chunk = CGF.EmitScalarConversion(
2127         RT.getGPUNumThreads(CGF),
2128         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2129         S.getIterationVariable()->getType(), S.getBeginLoc());
2130     return;
2131   }
2132   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2133       CGF, S, ScheduleKind, Chunk);
2134 }
2135
2136 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2137     CodeGenFunction &CGF, const OMPLoopDirective &S,
2138     OpenMPScheduleClauseKind &ScheduleKind,
2139     const Expr *&ChunkExpr) const {
2140   ScheduleKind = OMPC_SCHEDULE_static;
2141   // Chunk size is 1 in this
case. 2142 llvm::APInt ChunkSize(32, 1); 2143 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, 2144 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 2145 SourceLocation()); 2146 } 2147 2148 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( 2149 CodeGenFunction &CGF, const OMPExecutableDirective &D) const { 2150 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && 2151 " Expected target-based directive."); 2152 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); 2153 for (const CapturedStmt::Capture &C : CS->captures()) { 2154 // Capture variables captured by reference in lambdas for target-based 2155 // directives. 2156 if (!C.capturesVariable()) 2157 continue; 2158 const VarDecl *VD = C.getCapturedVar(); 2159 const auto *RD = VD->getType() 2160 .getCanonicalType() 2161 .getNonReferenceType() 2162 ->getAsCXXRecordDecl(); 2163 if (!RD || !RD->isLambda()) 2164 continue; 2165 Address VDAddr = CGF.GetAddrOfLocalVar(VD); 2166 LValue VDLVal; 2167 if (VD->getType().getCanonicalType()->isReferenceType()) 2168 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); 2169 else 2170 VDLVal = CGF.MakeAddrLValue( 2171 VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); 2172 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures; 2173 FieldDecl *ThisCapture = nullptr; 2174 RD->getCaptureFields(Captures, ThisCapture); 2175 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { 2176 LValue ThisLVal = 2177 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); 2178 llvm::Value *CXXThis = CGF.LoadCXXThis(); 2179 CGF.EmitStoreOfScalar(CXXThis, ThisLVal); 2180 } 2181 for (const LambdaCapture &LC : RD->captures()) { 2182 if (LC.getCaptureKind() != LCK_ByRef) 2183 continue; 2184 const ValueDecl *VD = LC.getCapturedVar(); 2185 // FIXME: For now VD is always a VarDecl because OpenMP does not support 2186 // capturing structured bindings in lambdas yet. 2187 if (!CS->capturesVariable(cast<VarDecl>(VD))) 2188 continue; 2189 auto It = Captures.find(VD); 2190 assert(It != Captures.end() && "Found lambda capture without field."); 2191 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); 2192 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); 2193 if (VD->getType().getCanonicalType()->isReferenceType()) 2194 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, 2195 VD->getType().getCanonicalType()) 2196 .getAddress(); 2197 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal); 2198 } 2199 } 2200 } 2201 2202 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, 2203 LangAS &AS) { 2204 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) 2205 return false; 2206 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2207 switch(A->getAllocatorType()) { 2208 case OMPAllocateDeclAttr::OMPNullMemAlloc: 2209 case OMPAllocateDeclAttr::OMPDefaultMemAlloc: 2210 // Not supported, fallback to the default mem space. 
2211 case OMPAllocateDeclAttr::OMPThreadMemAlloc: 2212 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 2213 case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 2214 case OMPAllocateDeclAttr::OMPHighBWMemAlloc: 2215 case OMPAllocateDeclAttr::OMPLowLatMemAlloc: 2216 AS = LangAS::Default; 2217 return true; 2218 case OMPAllocateDeclAttr::OMPConstMemAlloc: 2219 AS = LangAS::cuda_constant; 2220 return true; 2221 case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 2222 AS = LangAS::cuda_shared; 2223 return true; 2224 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: 2225 llvm_unreachable("Expected predefined allocator for the variables with the " 2226 "static storage."); 2227 } 2228 return false; 2229 } 2230 2231 // Get current OffloadArch and ignore any unknown values 2232 static OffloadArch getOffloadArch(CodeGenModule &CGM) { 2233 if (!CGM.getTarget().hasFeature("ptx")) 2234 return OffloadArch::UNKNOWN; 2235 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { 2236 if (Feature.getValue()) { 2237 OffloadArch Arch = StringToOffloadArch(Feature.getKey()); 2238 if (Arch != OffloadArch::UNKNOWN) 2239 return Arch; 2240 } 2241 } 2242 return OffloadArch::UNKNOWN; 2243 } 2244 2245 /// Check to see if target architecture supports unified addressing which is 2246 /// a restriction for OpenMP requires clause "unified_shared_memory". 2247 void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { 2248 for (const OMPClause *Clause : D->clauselists()) { 2249 if (Clause->getClauseKind() == OMPC_unified_shared_memory) { 2250 OffloadArch Arch = getOffloadArch(CGM); 2251 switch (Arch) { 2252 case OffloadArch::SM_20: 2253 case OffloadArch::SM_21: 2254 case OffloadArch::SM_30: 2255 case OffloadArch::SM_32_: 2256 case OffloadArch::SM_35: 2257 case OffloadArch::SM_37: 2258 case OffloadArch::SM_50: 2259 case OffloadArch::SM_52: 2260 case OffloadArch::SM_53: { 2261 SmallString<256> Buffer; 2262 llvm::raw_svector_ostream Out(Buffer); 2263 Out << "Target architecture " << OffloadArchToString(Arch) 2264 << " does not support unified addressing"; 2265 CGM.Error(Clause->getBeginLoc(), Out.str()); 2266 return; 2267 } 2268 case OffloadArch::SM_60: 2269 case OffloadArch::SM_61: 2270 case OffloadArch::SM_62: 2271 case OffloadArch::SM_70: 2272 case OffloadArch::SM_72: 2273 case OffloadArch::SM_75: 2274 case OffloadArch::SM_80: 2275 case OffloadArch::SM_86: 2276 case OffloadArch::SM_87: 2277 case OffloadArch::SM_89: 2278 case OffloadArch::SM_90: 2279 case OffloadArch::SM_90a: 2280 case OffloadArch::GFX600: 2281 case OffloadArch::GFX601: 2282 case OffloadArch::GFX602: 2283 case OffloadArch::GFX700: 2284 case OffloadArch::GFX701: 2285 case OffloadArch::GFX702: 2286 case OffloadArch::GFX703: 2287 case OffloadArch::GFX704: 2288 case OffloadArch::GFX705: 2289 case OffloadArch::GFX801: 2290 case OffloadArch::GFX802: 2291 case OffloadArch::GFX803: 2292 case OffloadArch::GFX805: 2293 case OffloadArch::GFX810: 2294 case OffloadArch::GFX9_GENERIC: 2295 case OffloadArch::GFX900: 2296 case OffloadArch::GFX902: 2297 case OffloadArch::GFX904: 2298 case OffloadArch::GFX906: 2299 case OffloadArch::GFX908: 2300 case OffloadArch::GFX909: 2301 case OffloadArch::GFX90a: 2302 case OffloadArch::GFX90c: 2303 case OffloadArch::GFX940: 2304 case OffloadArch::GFX941: 2305 case OffloadArch::GFX942: 2306 case OffloadArch::GFX10_1_GENERIC: 2307 case OffloadArch::GFX1010: 2308 case OffloadArch::GFX1011: 2309 case OffloadArch::GFX1012: 2310 case OffloadArch::GFX1013: 2311 case OffloadArch::GFX10_3_GENERIC: 2312 case 
OffloadArch::GFX1030: 2313 case OffloadArch::GFX1031: 2314 case OffloadArch::GFX1032: 2315 case OffloadArch::GFX1033: 2316 case OffloadArch::GFX1034: 2317 case OffloadArch::GFX1035: 2318 case OffloadArch::GFX1036: 2319 case OffloadArch::GFX11_GENERIC: 2320 case OffloadArch::GFX1100: 2321 case OffloadArch::GFX1101: 2322 case OffloadArch::GFX1102: 2323 case OffloadArch::GFX1103: 2324 case OffloadArch::GFX1150: 2325 case OffloadArch::GFX1151: 2326 case OffloadArch::GFX1152: 2327 case OffloadArch::GFX12_GENERIC: 2328 case OffloadArch::GFX1200: 2329 case OffloadArch::GFX1201: 2330 case OffloadArch::AMDGCNSPIRV: 2331 case OffloadArch::Generic: 2332 case OffloadArch::UNUSED: 2333 case OffloadArch::UNKNOWN: 2334 break; 2335 case OffloadArch::LAST: 2336 llvm_unreachable("Unexpected GPU arch."); 2337 } 2338 } 2339 } 2340 CGOpenMPRuntime::processRequiresDirective(D); 2341 } 2342 2343 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { 2344 CGBuilderTy &Bld = CGF.Builder; 2345 llvm::Module *M = &CGF.CGM.getModule(); 2346 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; 2347 llvm::Function *F = M->getFunction(LocSize); 2348 if (!F) { 2349 F = llvm::Function::Create( 2350 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false), 2351 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule()); 2352 } 2353 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads"); 2354 } 2355 2356 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { 2357 ArrayRef<llvm::Value *> Args{}; 2358 return CGF.EmitRuntimeCall( 2359 OMPBuilder.getOrCreateRuntimeFunction( 2360 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), 2361 Args); 2362 } 2363
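// Note (an illustrative assumption about the device runtime, not something
// this file defines): on NVPTX the two hardware queries above behave roughly
// like the corresponding CUDA builtins, i.e.
//   __kmpc_get_hardware_num_threads_in_block()  ~ blockDim.x
//   __kmpc_get_hardware_thread_id_in_block()    ~ threadIdx.x
// with analogous workgroup-level queries on AMDGCN.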