1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This provides a generalized class for OpenMP runtime code generation 10 // specialized by GPU targets NVPTX and AMDGCN. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "CGOpenMPRuntimeGPU.h" 15 #include "CGDebugInfo.h" 16 #include "CodeGenFunction.h" 17 #include "clang/AST/Attr.h" 18 #include "clang/AST/DeclOpenMP.h" 19 #include "clang/AST/OpenMPClause.h" 20 #include "clang/AST/StmtOpenMP.h" 21 #include "clang/AST/StmtVisitor.h" 22 #include "clang/Basic/Cuda.h" 23 #include "llvm/ADT/SmallPtrSet.h" 24 #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" 25 #include "llvm/Frontend/OpenMP/OMPGridValues.h" 26 27 using namespace clang; 28 using namespace CodeGen; 29 using namespace llvm::omp; 30 31 namespace { 32 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. 33 class NVPTXActionTy final : public PrePostActionTy { 34 llvm::FunctionCallee EnterCallee = nullptr; 35 ArrayRef<llvm::Value *> EnterArgs; 36 llvm::FunctionCallee ExitCallee = nullptr; 37 ArrayRef<llvm::Value *> ExitArgs; 38 bool Conditional = false; 39 llvm::BasicBlock *ContBlock = nullptr; 40 41 public: 42 NVPTXActionTy(llvm::FunctionCallee EnterCallee, 43 ArrayRef<llvm::Value *> EnterArgs, 44 llvm::FunctionCallee ExitCallee, 45 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false) 46 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), 47 ExitArgs(ExitArgs), Conditional(Conditional) {} 48 void Enter(CodeGenFunction &CGF) override { 49 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); 50 if (Conditional) { 51 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); 52 auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); 53 ContBlock = CGF.createBasicBlock("omp_if.end"); 54 // Generate the branch (If-stmt) 55 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); 56 CGF.EmitBlock(ThenBlock); 57 } 58 } 59 void Done(CodeGenFunction &CGF) { 60 // Emit the rest of blocks/branches 61 CGF.EmitBranch(ContBlock); 62 CGF.EmitBlock(ContBlock, true); 63 } 64 void Exit(CodeGenFunction &CGF) override { 65 CGF.EmitRuntimeCall(ExitCallee, ExitArgs); 66 } 67 }; 68 69 /// A class to track the execution mode when codegening directives within 70 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry 71 /// to the target region and used by containing directives such as 'parallel' 72 /// to emit optimized code. 73 class ExecutionRuntimeModesRAII { 74 private: 75 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode = 76 CGOpenMPRuntimeGPU::EM_Unknown; 77 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode; 78 79 public: 80 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode, 81 CGOpenMPRuntimeGPU::ExecutionMode EntryMode) 82 : ExecMode(ExecMode) { 83 SavedExecMode = ExecMode; 84 ExecMode = EntryMode; 85 } 86 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; } 87 }; 88 89 static const ValueDecl *getPrivateItem(const Expr *RefExpr) { 90 RefExpr = RefExpr->IgnoreParens(); 91 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) { 92 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts(); 93 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) 94 Base = TempASE->getBase()->IgnoreParenImpCasts(); 95 RefExpr = Base; 96 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) { 97 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts(); 98 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base)) 99 Base = TempOASE->getBase()->IgnoreParenImpCasts(); 100 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base)) 101 Base = TempASE->getBase()->IgnoreParenImpCasts(); 102 RefExpr = Base; 103 } 104 RefExpr = RefExpr->IgnoreParenImpCasts(); 105 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr)) 106 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()); 107 const auto *ME = cast<MemberExpr>(RefExpr); 108 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl()); 109 } 110 111 static RecordDecl *buildRecordForGlobalizedVars( 112 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls, 113 ArrayRef<const ValueDecl *> EscapedDeclsForTeams, 114 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 115 &MappedDeclsFields, 116 int BufSize) { 117 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>; 118 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) 119 return nullptr; 120 SmallVector<VarsDataTy, 4> GlobalizedVars; 121 for (const ValueDecl *D : EscapedDecls) 122 GlobalizedVars.emplace_back(C.getDeclAlign(D), D); 123 for (const ValueDecl *D : EscapedDeclsForTeams) 124 GlobalizedVars.emplace_back(C.getDeclAlign(D), D); 125 126 // Build struct _globalized_locals_ty { 127 // /* globalized vars */[WarSize] align (decl_align) 128 // /* globalized vars */ for EscapedDeclsForTeams 129 // }; 130 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); 131 GlobalizedRD->startDefinition(); 132 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(llvm::from_range, 133 EscapedDeclsForTeams); 134 for (const auto &Pair : GlobalizedVars) { 135 const ValueDecl *VD = Pair.second; 136 QualType Type = VD->getType(); 137 if (Type->isLValueReferenceType()) 138 Type = C.getPointerType(Type.getNonReferenceType()); 139 else 140 Type = Type.getNonReferenceType(); 141 SourceLocation Loc = VD->getLocation(); 142 FieldDecl *Field; 143 if (SingleEscaped.count(VD)) { 144 Field = FieldDecl::Create( 145 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 146 C.getTrivialTypeSourceInfo(Type, SourceLocation()), 147 /*BW=*/nullptr, /*Mutable=*/false, 148 /*InitStyle=*/ICIS_NoInit); 149 Field->setAccess(AS_public); 150 if (VD->hasAttrs()) { 151 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()), 152 E(VD->getAttrs().end()); 153 I != E; ++I) 154 Field->addAttr(*I); 155 } 156 } else { 157 if (BufSize > 1) { 158 llvm::APInt ArraySize(32, BufSize); 159 Type = C.getConstantArrayType(Type, ArraySize, nullptr, 160 ArraySizeModifier::Normal, 0); 161 } 162 Field = FieldDecl::Create( 163 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, 164 C.getTrivialTypeSourceInfo(Type, SourceLocation()), 165 /*BW=*/nullptr, /*Mutable=*/false, 166 /*InitStyle=*/ICIS_NoInit); 167 Field->setAccess(AS_public); 168 llvm::APInt Align(32, Pair.first.getQuantity()); 169 Field->addAttr(AlignedAttr::CreateImplicit( 170 C, /*IsAlignmentExpr=*/true, 171 IntegerLiteral::Create(C, Align, 172 C.getIntTypeForBitwidth(32, /*Signed=*/0), 173 SourceLocation()), 174 {}, AlignedAttr::GNU_aligned)); 175 } 176 GlobalizedRD->addDecl(Field); 177 MappedDeclsFields.try_emplace(VD, Field); 178 } 179 GlobalizedRD->completeDefinition(); 180 return GlobalizedRD; 181 } 182 183 /// Get the list of variables that can escape their declaration context. 184 class CheckVarsEscapingDeclContext final 185 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> { 186 CodeGenFunction &CGF; 187 llvm::SetVector<const ValueDecl *> EscapedDecls; 188 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls; 189 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls; 190 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters; 191 RecordDecl *GlobalizedRD = nullptr; 192 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; 193 bool AllEscaped = false; 194 bool IsForCombinedParallelRegion = false; 195 196 void markAsEscaped(const ValueDecl *VD) { 197 // Do not globalize declare target variables. 198 if (!isa<VarDecl>(VD) || 199 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) 200 return; 201 VD = cast<ValueDecl>(VD->getCanonicalDecl()); 202 // Use user-specified allocation. 203 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>()) 204 return; 205 // Variables captured by value must be globalized. 206 bool IsCaptured = false; 207 if (auto *CSI = CGF.CapturedStmtInfo) { 208 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) { 209 // Check if need to capture the variable that was already captured by 210 // value in the outer region. 211 IsCaptured = true; 212 if (!IsForCombinedParallelRegion) { 213 if (!FD->hasAttrs()) 214 return; 215 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>(); 216 if (!Attr) 217 return; 218 if (((Attr->getCaptureKind() != OMPC_map) && 219 !isOpenMPPrivate(Attr->getCaptureKind())) || 220 ((Attr->getCaptureKind() == OMPC_map) && 221 !FD->getType()->isAnyPointerType())) 222 return; 223 } 224 if (!FD->getType()->isReferenceType()) { 225 assert(!VD->getType()->isVariablyModifiedType() && 226 "Parameter captured by value with variably modified type"); 227 EscapedParameters.insert(VD); 228 } else if (!IsForCombinedParallelRegion) { 229 return; 230 } 231 } 232 } 233 if ((!CGF.CapturedStmtInfo || 234 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) && 235 VD->getType()->isReferenceType()) 236 // Do not globalize variables with reference type. 237 return; 238 if (VD->getType()->isVariablyModifiedType()) { 239 // If not captured at the target region level then mark the escaped 240 // variable as delayed. 241 if (IsCaptured) 242 EscapedVariableLengthDecls.insert(VD); 243 else 244 DelayedVariableLengthDecls.insert(VD); 245 } else 246 EscapedDecls.insert(VD); 247 } 248 249 void VisitValueDecl(const ValueDecl *VD) { 250 if (VD->getType()->isLValueReferenceType()) 251 markAsEscaped(VD); 252 if (const auto *VarD = dyn_cast<VarDecl>(VD)) { 253 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) { 254 const bool SavedAllEscaped = AllEscaped; 255 AllEscaped = VD->getType()->isLValueReferenceType(); 256 Visit(VarD->getInit()); 257 AllEscaped = SavedAllEscaped; 258 } 259 } 260 } 261 void VisitOpenMPCapturedStmt(const CapturedStmt *S, 262 ArrayRef<OMPClause *> Clauses, 263 bool IsCombinedParallelRegion) { 264 if (!S) 265 return; 266 for (const CapturedStmt::Capture &C : S->captures()) { 267 if (C.capturesVariable() && !C.capturesVariableByCopy()) { 268 const ValueDecl *VD = C.getCapturedVar(); 269 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion; 270 if (IsCombinedParallelRegion) { 271 // Check if the variable is privatized in the combined construct and 272 // those private copies must be shared in the inner parallel 273 // directive. 274 IsForCombinedParallelRegion = false; 275 for (const OMPClause *C : Clauses) { 276 if (!isOpenMPPrivate(C->getClauseKind()) || 277 C->getClauseKind() == OMPC_reduction || 278 C->getClauseKind() == OMPC_linear || 279 C->getClauseKind() == OMPC_private) 280 continue; 281 ArrayRef<const Expr *> Vars; 282 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C)) 283 Vars = PC->getVarRefs(); 284 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C)) 285 Vars = PC->getVarRefs(); 286 else 287 llvm_unreachable("Unexpected clause."); 288 for (const auto *E : Vars) { 289 const Decl *D = 290 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl(); 291 if (D == VD->getCanonicalDecl()) { 292 IsForCombinedParallelRegion = true; 293 break; 294 } 295 } 296 if (IsForCombinedParallelRegion) 297 break; 298 } 299 } 300 markAsEscaped(VD); 301 if (isa<OMPCapturedExprDecl>(VD)) 302 VisitValueDecl(VD); 303 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion; 304 } 305 } 306 } 307 308 void buildRecordForGlobalizedVars(bool IsInTTDRegion) { 309 assert(!GlobalizedRD && 310 "Record for globalized variables is built already."); 311 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams; 312 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; 313 if (IsInTTDRegion) 314 EscapedDeclsForTeams = EscapedDecls.getArrayRef(); 315 else 316 EscapedDeclsForParallel = EscapedDecls.getArrayRef(); 317 GlobalizedRD = ::buildRecordForGlobalizedVars( 318 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, 319 MappedDeclsFields, WarpSize); 320 } 321 322 public: 323 CheckVarsEscapingDeclContext(CodeGenFunction &CGF, 324 ArrayRef<const ValueDecl *> TeamsReductions) 325 : CGF(CGF), EscapedDecls(llvm::from_range, TeamsReductions) {} 326 ~CheckVarsEscapingDeclContext() = default; 327 void VisitDeclStmt(const DeclStmt *S) { 328 if (!S) 329 return; 330 for (const Decl *D : S->decls()) 331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D)) 332 VisitValueDecl(VD); 333 } 334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) { 335 if (!D) 336 return; 337 if (!D->hasAssociatedStmt()) 338 return; 339 if (const auto *S = 340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) { 341 // Do not analyze directives that do not actually require capturing, 342 // like `omp for` or `omp simd` directives. 343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions; 344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind()); 345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) { 346 VisitStmt(S->getCapturedStmt()); 347 return; 348 } 349 VisitOpenMPCapturedStmt( 350 S, D->clauses(), 351 CaptureRegions.back() == OMPD_parallel && 352 isOpenMPDistributeDirective(D->getDirectiveKind())); 353 } 354 } 355 void VisitCapturedStmt(const CapturedStmt *S) { 356 if (!S) 357 return; 358 for (const CapturedStmt::Capture &C : S->captures()) { 359 if (C.capturesVariable() && !C.capturesVariableByCopy()) { 360 const ValueDecl *VD = C.getCapturedVar(); 361 markAsEscaped(VD); 362 if (isa<OMPCapturedExprDecl>(VD)) 363 VisitValueDecl(VD); 364 } 365 } 366 } 367 void VisitLambdaExpr(const LambdaExpr *E) { 368 if (!E) 369 return; 370 for (const LambdaCapture &C : E->captures()) { 371 if (C.capturesVariable()) { 372 if (C.getCaptureKind() == LCK_ByRef) { 373 const ValueDecl *VD = C.getCapturedVar(); 374 markAsEscaped(VD); 375 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD)) 376 VisitValueDecl(VD); 377 } 378 } 379 } 380 } 381 void VisitBlockExpr(const BlockExpr *E) { 382 if (!E) 383 return; 384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) { 385 if (C.isByRef()) { 386 const VarDecl *VD = C.getVariable(); 387 markAsEscaped(VD); 388 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture()) 389 VisitValueDecl(VD); 390 } 391 } 392 } 393 void VisitCallExpr(const CallExpr *E) { 394 if (!E) 395 return; 396 for (const Expr *Arg : E->arguments()) { 397 if (!Arg) 398 continue; 399 if (Arg->isLValue()) { 400 const bool SavedAllEscaped = AllEscaped; 401 AllEscaped = true; 402 Visit(Arg); 403 AllEscaped = SavedAllEscaped; 404 } else { 405 Visit(Arg); 406 } 407 } 408 Visit(E->getCallee()); 409 } 410 void VisitDeclRefExpr(const DeclRefExpr *E) { 411 if (!E) 412 return; 413 const ValueDecl *VD = E->getDecl(); 414 if (AllEscaped) 415 markAsEscaped(VD); 416 if (isa<OMPCapturedExprDecl>(VD)) 417 VisitValueDecl(VD); 418 else if (VD->isInitCapture()) 419 VisitValueDecl(VD); 420 } 421 void VisitUnaryOperator(const UnaryOperator *E) { 422 if (!E) 423 return; 424 if (E->getOpcode() == UO_AddrOf) { 425 const bool SavedAllEscaped = AllEscaped; 426 AllEscaped = true; 427 Visit(E->getSubExpr()); 428 AllEscaped = SavedAllEscaped; 429 } else { 430 Visit(E->getSubExpr()); 431 } 432 } 433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) { 434 if (!E) 435 return; 436 if (E->getCastKind() == CK_ArrayToPointerDecay) { 437 const bool SavedAllEscaped = AllEscaped; 438 AllEscaped = true; 439 Visit(E->getSubExpr()); 440 AllEscaped = SavedAllEscaped; 441 } else { 442 Visit(E->getSubExpr()); 443 } 444 } 445 void VisitExpr(const Expr *E) { 446 if (!E) 447 return; 448 bool SavedAllEscaped = AllEscaped; 449 if (!E->isLValue()) 450 AllEscaped = false; 451 for (const Stmt *Child : E->children()) 452 if (Child) 453 Visit(Child); 454 AllEscaped = SavedAllEscaped; 455 } 456 void VisitStmt(const Stmt *S) { 457 if (!S) 458 return; 459 for (const Stmt *Child : S->children()) 460 if (Child) 461 Visit(Child); 462 } 463 464 /// Returns the record that handles all the escaped local variables and used 465 /// instead of their original storage. 466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) { 467 if (!GlobalizedRD) 468 buildRecordForGlobalizedVars(IsInTTDRegion); 469 return GlobalizedRD; 470 } 471 472 /// Returns the field in the globalized record for the escaped variable. 473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const { 474 assert(GlobalizedRD && 475 "Record for globalized variables must be generated already."); 476 return MappedDeclsFields.lookup(VD); 477 } 478 479 /// Returns the list of the escaped local variables/parameters. 480 ArrayRef<const ValueDecl *> getEscapedDecls() const { 481 return EscapedDecls.getArrayRef(); 482 } 483 484 /// Checks if the escaped local variable is actually a parameter passed by 485 /// value. 486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const { 487 return EscapedParameters; 488 } 489 490 /// Returns the list of the escaped variables with the variably modified 491 /// types. 492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const { 493 return EscapedVariableLengthDecls.getArrayRef(); 494 } 495 496 /// Returns the list of the delayed variables with the variably modified 497 /// types. 498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const { 499 return DelayedVariableLengthDecls.getArrayRef(); 500 } 501 }; 502 } // anonymous namespace 503 504 CGOpenMPRuntimeGPU::ExecutionMode 505 CGOpenMPRuntimeGPU::getExecutionMode() const { 506 return CurrentExecutionMode; 507 } 508 509 CGOpenMPRuntimeGPU::DataSharingMode 510 CGOpenMPRuntimeGPU::getDataSharingMode() const { 511 return CurrentDataSharingMode; 512 } 513 514 /// Check for inner (nested) SPMD construct, if any 515 static bool hasNestedSPMDDirective(ASTContext &Ctx, 516 const OMPExecutableDirective &D) { 517 const auto *CS = D.getInnermostCapturedStmt(); 518 const auto *Body = 519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true); 520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 521 522 if (const auto *NestedDir = 523 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind(); 525 switch (D.getDirectiveKind()) { 526 case OMPD_target: 527 if (isOpenMPParallelDirective(DKind)) 528 return true; 529 if (DKind == OMPD_teams) { 530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers( 531 /*IgnoreCaptured=*/true); 532 if (!Body) 533 return false; 534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body); 535 if (const auto *NND = 536 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) { 537 DKind = NND->getDirectiveKind(); 538 if (isOpenMPParallelDirective(DKind)) 539 return true; 540 } 541 } 542 return false; 543 case OMPD_target_teams: 544 return isOpenMPParallelDirective(DKind); 545 case OMPD_target_simd: 546 case OMPD_target_parallel: 547 case OMPD_target_parallel_for: 548 case OMPD_target_parallel_for_simd: 549 case OMPD_target_teams_distribute: 550 case OMPD_target_teams_distribute_simd: 551 case OMPD_target_teams_distribute_parallel_for: 552 case OMPD_target_teams_distribute_parallel_for_simd: 553 case OMPD_parallel: 554 case OMPD_for: 555 case OMPD_parallel_for: 556 case OMPD_parallel_master: 557 case OMPD_parallel_sections: 558 case OMPD_for_simd: 559 case OMPD_parallel_for_simd: 560 case OMPD_cancel: 561 case OMPD_cancellation_point: 562 case OMPD_ordered: 563 case OMPD_threadprivate: 564 case OMPD_allocate: 565 case OMPD_task: 566 case OMPD_simd: 567 case OMPD_sections: 568 case OMPD_section: 569 case OMPD_single: 570 case OMPD_master: 571 case OMPD_critical: 572 case OMPD_taskyield: 573 case OMPD_barrier: 574 case OMPD_taskwait: 575 case OMPD_taskgroup: 576 case OMPD_atomic: 577 case OMPD_flush: 578 case OMPD_depobj: 579 case OMPD_scan: 580 case OMPD_teams: 581 case OMPD_target_data: 582 case OMPD_target_exit_data: 583 case OMPD_target_enter_data: 584 case OMPD_distribute: 585 case OMPD_distribute_simd: 586 case OMPD_distribute_parallel_for: 587 case OMPD_distribute_parallel_for_simd: 588 case OMPD_teams_distribute: 589 case OMPD_teams_distribute_simd: 590 case OMPD_teams_distribute_parallel_for: 591 case OMPD_teams_distribute_parallel_for_simd: 592 case OMPD_target_update: 593 case OMPD_declare_simd: 594 case OMPD_declare_variant: 595 case OMPD_begin_declare_variant: 596 case OMPD_end_declare_variant: 597 case OMPD_declare_target: 598 case OMPD_end_declare_target: 599 case OMPD_declare_reduction: 600 case OMPD_declare_mapper: 601 case OMPD_taskloop: 602 case OMPD_taskloop_simd: 603 case OMPD_master_taskloop: 604 case OMPD_master_taskloop_simd: 605 case OMPD_parallel_master_taskloop: 606 case OMPD_parallel_master_taskloop_simd: 607 case OMPD_requires: 608 case OMPD_unknown: 609 default: 610 llvm_unreachable("Unexpected directive."); 611 } 612 } 613 614 return false; 615 } 616 617 static bool supportsSPMDExecutionMode(ASTContext &Ctx, 618 const OMPExecutableDirective &D) { 619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); 620 switch (DirectiveKind) { 621 case OMPD_target: 622 case OMPD_target_teams: 623 return hasNestedSPMDDirective(Ctx, D); 624 case OMPD_target_parallel_loop: 625 case OMPD_target_parallel: 626 case OMPD_target_parallel_for: 627 case OMPD_target_parallel_for_simd: 628 case OMPD_target_teams_distribute_parallel_for: 629 case OMPD_target_teams_distribute_parallel_for_simd: 630 case OMPD_target_simd: 631 case OMPD_target_teams_distribute_simd: 632 return true; 633 case OMPD_target_teams_distribute: 634 return false; 635 case OMPD_target_teams_loop: 636 // Whether this is true or not depends on how the directive will 637 // eventually be emitted. 638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D)) 639 return TTLD->canBeParallelFor(); 640 return false; 641 case OMPD_parallel: 642 case OMPD_for: 643 case OMPD_parallel_for: 644 case OMPD_parallel_master: 645 case OMPD_parallel_sections: 646 case OMPD_for_simd: 647 case OMPD_parallel_for_simd: 648 case OMPD_cancel: 649 case OMPD_cancellation_point: 650 case OMPD_ordered: 651 case OMPD_threadprivate: 652 case OMPD_allocate: 653 case OMPD_task: 654 case OMPD_simd: 655 case OMPD_sections: 656 case OMPD_section: 657 case OMPD_single: 658 case OMPD_master: 659 case OMPD_critical: 660 case OMPD_taskyield: 661 case OMPD_barrier: 662 case OMPD_taskwait: 663 case OMPD_taskgroup: 664 case OMPD_atomic: 665 case OMPD_flush: 666 case OMPD_depobj: 667 case OMPD_scan: 668 case OMPD_teams: 669 case OMPD_target_data: 670 case OMPD_target_exit_data: 671 case OMPD_target_enter_data: 672 case OMPD_distribute: 673 case OMPD_distribute_simd: 674 case OMPD_distribute_parallel_for: 675 case OMPD_distribute_parallel_for_simd: 676 case OMPD_teams_distribute: 677 case OMPD_teams_distribute_simd: 678 case OMPD_teams_distribute_parallel_for: 679 case OMPD_teams_distribute_parallel_for_simd: 680 case OMPD_target_update: 681 case OMPD_declare_simd: 682 case OMPD_declare_variant: 683 case OMPD_begin_declare_variant: 684 case OMPD_end_declare_variant: 685 case OMPD_declare_target: 686 case OMPD_end_declare_target: 687 case OMPD_declare_reduction: 688 case OMPD_declare_mapper: 689 case OMPD_taskloop: 690 case OMPD_taskloop_simd: 691 case OMPD_master_taskloop: 692 case OMPD_master_taskloop_simd: 693 case OMPD_parallel_master_taskloop: 694 case OMPD_parallel_master_taskloop_simd: 695 case OMPD_requires: 696 case OMPD_unknown: 697 default: 698 break; 699 } 700 llvm_unreachable( 701 "Unknown programming model for OpenMP directive on NVPTX target."); 702 } 703 704 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, 705 StringRef ParentName, 706 llvm::Function *&OutlinedFn, 707 llvm::Constant *&OutlinedFnID, 708 bool IsOffloadEntry, 709 const RegionCodeGenTy &CodeGen) { 710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD); 711 EntryFunctionState EST; 712 WrapperFunctionsMap.clear(); 713 714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 715 assert(!IsBareKernel && "bare kernel should not be at generic mode"); 716 717 // Emit target region as a standalone region. 718 class NVPTXPrePostActionTy : public PrePostActionTy { 719 CGOpenMPRuntimeGPU::EntryFunctionState &EST; 720 const OMPExecutableDirective &D; 721 722 public: 723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST, 724 const OMPExecutableDirective &D) 725 : EST(EST), D(D) {} 726 void Enter(CodeGenFunction &CGF) override { 727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false); 729 // Skip target region initialization. 730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); 731 } 732 void Exit(CodeGenFunction &CGF) override { 733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 734 RT.clearLocThreadIdInsertPt(CGF); 735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false); 736 } 737 } Action(EST, D); 738 CodeGen.setAction(Action); 739 IsInTTDRegion = true; 740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, 741 IsOffloadEntry, CodeGen); 742 IsInTTDRegion = false; 743 } 744 745 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D, 746 CodeGenFunction &CGF, 747 EntryFunctionState &EST, bool IsSPMD) { 748 llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs Attrs; 749 Attrs.ExecFlags = 750 IsSPMD ? llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD 751 : llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; 752 computeMinAndMaxThreadsAndTeams(D, CGF, Attrs); 753 754 CGBuilderTy &Bld = CGF.Builder; 755 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, Attrs)); 756 if (!IsSPMD) 757 emitGenericVarsProlog(CGF, EST.Loc); 758 } 759 760 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF, 761 EntryFunctionState &EST, 762 bool IsSPMD) { 763 if (!IsSPMD) 764 emitGenericVarsEpilog(CGF); 765 766 // This is temporary until we remove the fixed sized buffer. 767 ASTContext &C = CGM.getContext(); 768 RecordDecl *StaticRD = C.buildImplicitRecord( 769 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union); 770 StaticRD->startDefinition(); 771 for (const RecordDecl *TeamReductionRec : TeamsReductions) { 772 QualType RecTy = C.getRecordType(TeamReductionRec); 773 auto *Field = FieldDecl::Create( 774 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, 775 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), 776 /*BW=*/nullptr, /*Mutable=*/false, 777 /*InitStyle=*/ICIS_NoInit); 778 Field->setAccess(AS_public); 779 StaticRD->addDecl(Field); 780 } 781 StaticRD->completeDefinition(); 782 QualType StaticTy = C.getRecordType(StaticRD); 783 llvm::Type *LLVMReductionsBufferTy = 784 CGM.getTypes().ConvertTypeForMem(StaticTy); 785 const auto &DL = CGM.getModule().getDataLayout(); 786 uint64_t ReductionDataSize = 787 TeamsReductions.empty() 788 ? 0 789 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue(); 790 CGBuilderTy &Bld = CGF.Builder; 791 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize, 792 C.getLangOpts().OpenMPCUDAReductionBufNum); 793 TeamsReductions.clear(); 794 } 795 796 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D, 797 StringRef ParentName, 798 llvm::Function *&OutlinedFn, 799 llvm::Constant *&OutlinedFnID, 800 bool IsOffloadEntry, 801 const RegionCodeGenTy &CodeGen) { 802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD); 803 EntryFunctionState EST; 804 805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 806 807 // Emit target region as a standalone region. 808 class NVPTXPrePostActionTy : public PrePostActionTy { 809 CGOpenMPRuntimeGPU &RT; 810 CGOpenMPRuntimeGPU::EntryFunctionState &EST; 811 bool IsBareKernel; 812 DataSharingMode Mode; 813 const OMPExecutableDirective &D; 814 815 public: 816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT, 817 CGOpenMPRuntimeGPU::EntryFunctionState &EST, 818 bool IsBareKernel, const OMPExecutableDirective &D) 819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel), 820 Mode(RT.CurrentDataSharingMode), D(D) {} 821 void Enter(CodeGenFunction &CGF) override { 822 if (IsBareKernel) { 823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA; 824 return; 825 } 826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true); 827 // Skip target region initialization. 828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); 829 } 830 void Exit(CodeGenFunction &CGF) override { 831 if (IsBareKernel) { 832 RT.CurrentDataSharingMode = Mode; 833 return; 834 } 835 RT.clearLocThreadIdInsertPt(CGF); 836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true); 837 } 838 } Action(*this, EST, IsBareKernel, D); 839 CodeGen.setAction(Action); 840 IsInTTDRegion = true; 841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, 842 IsOffloadEntry, CodeGen); 843 IsInTTDRegion = false; 844 } 845 846 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction( 847 const OMPExecutableDirective &D, StringRef ParentName, 848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, 849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { 850 if (!IsOffloadEntry) // Nothing to do. 851 return; 852 853 assert(!ParentName.empty() && "Invalid target region parent name!"); 854 855 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); 856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 857 if (Mode || IsBareKernel) 858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, 859 CodeGen); 860 else 861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, 862 CodeGen); 863 } 864 865 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM) 866 : CGOpenMPRuntime(CGM) { 867 llvm::OpenMPIRBuilderConfig Config( 868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(), 869 CGM.getLangOpts().OpenMPOffloadMandatory, 870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false, 871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false); 872 OMPBuilder.setConfig(Config); 873 874 if (!CGM.getLangOpts().OpenMPIsTargetDevice) 875 llvm_unreachable("OpenMP can only handle device code."); 876 877 if (CGM.getLangOpts().OpenMPCUDAMode) 878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA; 879 880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder(); 881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty()) 882 return; 883 884 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug, 885 "__omp_rtl_debug_kind"); 886 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription, 887 "__omp_rtl_assume_teams_oversubscription"); 888 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription, 889 "__omp_rtl_assume_threads_oversubscription"); 890 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, 891 "__omp_rtl_assume_no_thread_state"); 892 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, 893 "__omp_rtl_assume_no_nested_parallelism"); 894 } 895 896 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, 897 ProcBindKind ProcBind, 898 SourceLocation Loc) { 899 // Nothing to do. 900 } 901 902 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF, 903 llvm::Value *NumThreads, 904 SourceLocation Loc) { 905 // Nothing to do. 906 } 907 908 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF, 909 const Expr *NumTeams, 910 const Expr *ThreadLimit, 911 SourceLocation Loc) {} 912 913 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction( 914 CodeGenFunction &CGF, const OMPExecutableDirective &D, 915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, 916 const RegionCodeGenTy &CodeGen) { 917 // Emit target region as a standalone region. 918 bool PrevIsInTTDRegion = IsInTTDRegion; 919 IsInTTDRegion = false; 920 auto *OutlinedFun = 921 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction( 922 CGF, D, ThreadIDVar, InnermostKind, CodeGen)); 923 IsInTTDRegion = PrevIsInTTDRegion; 924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) { 925 llvm::Function *WrapperFun = 926 createParallelDataSharingWrapper(OutlinedFun, D); 927 WrapperFunctionsMap[OutlinedFun] = WrapperFun; 928 } 929 930 return OutlinedFun; 931 } 932 933 /// Get list of lastprivate variables from the teams distribute ... or 934 /// teams {distribute ...} directives. 935 static void 936 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, 937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) { 938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && 939 "expected teams directive."); 940 const OMPExecutableDirective *Dir = &D; 941 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) { 942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild( 943 Ctx, 944 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers( 945 /*IgnoreCaptured=*/true))) { 946 Dir = dyn_cast_or_null<OMPExecutableDirective>(S); 947 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind())) 948 Dir = nullptr; 949 } 950 } 951 if (!Dir) 952 return; 953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) { 954 for (const Expr *E : C->getVarRefs()) 955 Vars.push_back(getPrivateItem(E)); 956 } 957 } 958 959 /// Get list of reduction variables from the teams ... directives. 960 static void 961 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, 962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) { 963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) && 964 "expected teams directive."); 965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { 966 for (const Expr *E : C->privates()) 967 Vars.push_back(getPrivateItem(E)); 968 } 969 } 970 971 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( 972 CodeGenFunction &CGF, const OMPExecutableDirective &D, 973 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, 974 const RegionCodeGenTy &CodeGen) { 975 SourceLocation Loc = D.getBeginLoc(); 976 977 const RecordDecl *GlobalizedRD = nullptr; 978 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions; 979 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields; 980 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; 981 // Globalize team reductions variable unconditionally in all modes. 982 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) 983 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); 984 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { 985 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); 986 if (!LastPrivatesReductions.empty()) { 987 GlobalizedRD = ::buildRecordForGlobalizedVars( 988 CGM.getContext(), {}, LastPrivatesReductions, MappedDeclsFields, 989 WarpSize); 990 } 991 } else if (!LastPrivatesReductions.empty()) { 992 assert(!TeamAndReductions.first && 993 "Previous team declaration is not expected."); 994 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl(); 995 std::swap(TeamAndReductions.second, LastPrivatesReductions); 996 } 997 998 // Emit target region as a standalone region. 999 class NVPTXPrePostActionTy : public PrePostActionTy { 1000 SourceLocation &Loc; 1001 const RecordDecl *GlobalizedRD; 1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 1003 &MappedDeclsFields; 1004 1005 public: 1006 NVPTXPrePostActionTy( 1007 SourceLocation &Loc, const RecordDecl *GlobalizedRD, 1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> 1009 &MappedDeclsFields) 1010 : Loc(Loc), GlobalizedRD(GlobalizedRD), 1011 MappedDeclsFields(MappedDeclsFields) {} 1012 void Enter(CodeGenFunction &CGF) override { 1013 auto &Rt = 1014 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 1015 if (GlobalizedRD) { 1016 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; 1017 I->getSecond().MappedParams = 1018 std::make_unique<CodeGenFunction::OMPMapVars>(); 1019 DeclToAddrMapTy &Data = I->getSecond().LocalVarData; 1020 for (const auto &Pair : MappedDeclsFields) { 1021 assert(Pair.getFirst()->isCanonicalDecl() && 1022 "Expected canonical declaration"); 1023 Data.try_emplace(Pair.getFirst()); 1024 } 1025 } 1026 Rt.emitGenericVarsProlog(CGF, Loc); 1027 } 1028 void Exit(CodeGenFunction &CGF) override { 1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) 1030 .emitGenericVarsEpilog(CGF); 1031 } 1032 } Action(Loc, GlobalizedRD, MappedDeclsFields); 1033 CodeGen.setAction(Action); 1034 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction( 1035 CGF, D, ThreadIDVar, InnermostKind, CodeGen); 1036 1037 return OutlinedFun; 1038 } 1039 1040 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF, 1041 SourceLocation Loc) { 1042 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1043 return; 1044 1045 CGBuilderTy &Bld = CGF.Builder; 1046 1047 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1048 if (I == FunctionGlobalizedDecls.end()) 1049 return; 1050 1051 for (auto &Rec : I->getSecond().LocalVarData) { 1052 const auto *VD = cast<VarDecl>(Rec.first); 1053 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); 1054 QualType VarTy = VD->getType(); 1055 1056 // Get the local allocation of a firstprivate variable before sharing 1057 llvm::Value *ParValue; 1058 if (EscapedParam) { 1059 LValue ParLVal = 1060 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); 1061 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); 1062 } 1063 1064 // Allocate space for the variable to be globalized 1065 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())}; 1066 llvm::CallBase *VoidPtr = 1067 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1068 CGM.getModule(), OMPRTL___kmpc_alloc_shared), 1069 AllocArgs, VD->getName()); 1070 // FIXME: We should use the variables actual alignment as an argument. 1071 VoidPtr->addRetAttr(llvm::Attribute::get( 1072 CGM.getLLVMContext(), llvm::Attribute::Alignment, 1073 CGM.getContext().getTargetInfo().getNewAlign() / 8)); 1074 1075 // Cast the void pointer and get the address of the globalized variable. 1076 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( 1077 VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack"); 1078 LValue VarAddr = 1079 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy); 1080 Rec.second.PrivateAddr = VarAddr.getAddress(); 1081 Rec.second.GlobalizedVal = VoidPtr; 1082 1083 // Assign the local allocation to the newly globalized location. 1084 if (EscapedParam) { 1085 CGF.EmitStoreOfScalar(ParValue, VarAddr); 1086 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress()); 1087 } 1088 if (auto *DI = CGF.getDebugInfo()) 1089 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation())); 1090 } 1091 1092 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) { 1093 const auto *VD = cast<VarDecl>(ValueD); 1094 std::pair<llvm::Value *, llvm::Value *> AddrSizePair = 1095 getKmpcAllocShared(CGF, VD); 1096 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair); 1097 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(), 1098 CGM.getContext().getDeclAlign(VD), 1099 AlignmentSource::Decl); 1100 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress()); 1101 } 1102 I->getSecond().MappedParams->apply(CGF); 1103 } 1104 1105 bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF, 1106 const VarDecl *VD) const { 1107 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1108 if (I == FunctionGlobalizedDecls.end()) 1109 return false; 1110 1111 // Check variable declaration is delayed: 1112 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD); 1113 } 1114 1115 std::pair<llvm::Value *, llvm::Value *> 1116 CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF, 1117 const VarDecl *VD) { 1118 CGBuilderTy &Bld = CGF.Builder; 1119 1120 // Compute size and alignment. 1121 llvm::Value *Size = CGF.getTypeSize(VD->getType()); 1122 CharUnits Align = CGM.getContext().getDeclAlign(VD); 1123 Size = Bld.CreateNUWAdd( 1124 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); 1125 llvm::Value *AlignVal = 1126 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); 1127 Size = Bld.CreateUDiv(Size, AlignVal); 1128 Size = Bld.CreateNUWMul(Size, AlignVal); 1129 1130 // Allocate space for this VLA object to be globalized. 1131 llvm::Value *AllocArgs[] = {Size}; 1132 llvm::CallBase *VoidPtr = 1133 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1134 CGM.getModule(), OMPRTL___kmpc_alloc_shared), 1135 AllocArgs, VD->getName()); 1136 VoidPtr->addRetAttr(llvm::Attribute::get( 1137 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity())); 1138 1139 return std::make_pair(VoidPtr, Size); 1140 } 1141 1142 void CGOpenMPRuntimeGPU::getKmpcFreeShared( 1143 CodeGenFunction &CGF, 1144 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) { 1145 // Deallocate the memory for each globalized VLA object 1146 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1147 CGM.getModule(), OMPRTL___kmpc_free_shared), 1148 {AddrSizePair.first, AddrSizePair.second}); 1149 } 1150 1151 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) { 1152 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1153 return; 1154 1155 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 1156 if (I != FunctionGlobalizedDecls.end()) { 1157 // Deallocate the memory for each globalized VLA object that was 1158 // globalized in the prolog (i.e. emitGenericVarsProlog). 1159 for (const auto &AddrSizePair : 1160 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { 1161 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1162 CGM.getModule(), OMPRTL___kmpc_free_shared), 1163 {AddrSizePair.first, AddrSizePair.second}); 1164 } 1165 // Deallocate the memory for each globalized value 1166 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { 1167 const auto *VD = cast<VarDecl>(Rec.first); 1168 I->getSecond().MappedParams->restore(CGF); 1169 1170 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal, 1171 CGF.getTypeSize(VD->getType())}; 1172 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1173 CGM.getModule(), OMPRTL___kmpc_free_shared), 1174 FreeArgs); 1175 } 1176 } 1177 } 1178 1179 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, 1180 const OMPExecutableDirective &D, 1181 SourceLocation Loc, 1182 llvm::Function *OutlinedFn, 1183 ArrayRef<llvm::Value *> CapturedVars) { 1184 if (!CGF.HaveInsertPoint()) 1185 return; 1186 1187 bool IsBareKernel = D.getSingleClause<OMPXBareClause>(); 1188 1189 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 1190 /*Name=*/".zero.addr"); 1191 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); 1192 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; 1193 // We don't emit any thread id function call in bare kernel, but because the 1194 // outlined function has a pointer argument, we emit a nullptr here. 1195 if (IsBareKernel) 1196 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy)); 1197 else 1198 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF)); 1199 OutlinedFnArgs.push_back(ZeroAddr.getPointer()); 1200 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); 1201 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); 1202 } 1203 1204 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF, 1205 SourceLocation Loc, 1206 llvm::Function *OutlinedFn, 1207 ArrayRef<llvm::Value *> CapturedVars, 1208 const Expr *IfCond, 1209 llvm::Value *NumThreads) { 1210 if (!CGF.HaveInsertPoint()) 1211 return; 1212 1213 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, 1214 NumThreads](CodeGenFunction &CGF, 1215 PrePostActionTy &Action) { 1216 CGBuilderTy &Bld = CGF.Builder; 1217 llvm::Value *NumThreadsVal = NumThreads; 1218 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn]; 1219 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy); 1220 if (WFn) 1221 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); 1222 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy); 1223 1224 // Create a private scope that will globalize the arguments 1225 // passed from the outside of the target region. 1226 // TODO: Is that needed? 1227 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); 1228 1229 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca( 1230 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()), 1231 "captured_vars_addrs"); 1232 // There's something to share. 1233 if (!CapturedVars.empty()) { 1234 // Prepare for parallel region. Indicate the outlined function. 1235 ASTContext &Ctx = CGF.getContext(); 1236 unsigned Idx = 0; 1237 for (llvm::Value *V : CapturedVars) { 1238 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx); 1239 llvm::Value *PtrV; 1240 if (V->getType()->isIntegerTy()) 1241 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy); 1242 else 1243 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy); 1244 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false, 1245 Ctx.getPointerType(Ctx.VoidPtrTy)); 1246 ++Idx; 1247 } 1248 } 1249 1250 llvm::Value *IfCondVal = nullptr; 1251 if (IfCond) 1252 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty, 1253 /* isSigned */ false); 1254 else 1255 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1); 1256 1257 if (!NumThreadsVal) 1258 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1); 1259 else 1260 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty); 1261 1262 assert(IfCondVal && "Expected a value"); 1263 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 1264 llvm::Value *Args[] = { 1265 RTLoc, 1266 getThreadID(CGF, Loc), 1267 IfCondVal, 1268 NumThreadsVal, 1269 llvm::ConstantInt::get(CGF.Int32Ty, -1), 1270 FnPtr, 1271 ID, 1272 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF), 1273 CGF.VoidPtrPtrTy), 1274 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; 1275 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1276 CGM.getModule(), OMPRTL___kmpc_parallel_51), 1277 Args); 1278 }; 1279 1280 RegionCodeGenTy RCG(ParallelGen); 1281 RCG(CGF); 1282 } 1283 1284 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) { 1285 // Always emit simple barriers! 1286 if (!CGF.HaveInsertPoint()) 1287 return; 1288 // Build call __kmpc_barrier_simple_spmd(nullptr, 0); 1289 // This function does not use parameters, so we can emit just default values. 1290 llvm::Value *Args[] = { 1291 llvm::ConstantPointerNull::get( 1292 cast<llvm::PointerType>(getIdentTyPointerTy())), 1293 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; 1294 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1295 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd), 1296 Args); 1297 } 1298 1299 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF, 1300 SourceLocation Loc, 1301 OpenMPDirectiveKind Kind, bool, 1302 bool) { 1303 // Always emit simple barriers! 1304 if (!CGF.HaveInsertPoint()) 1305 return; 1306 // Build call __kmpc_cancel_barrier(loc, thread_id); 1307 unsigned Flags = getDefaultFlagsForBarriers(Kind); 1308 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags), 1309 getThreadID(CGF, Loc)}; 1310 1311 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1312 CGM.getModule(), OMPRTL___kmpc_barrier), 1313 Args); 1314 } 1315 1316 void CGOpenMPRuntimeGPU::emitCriticalRegion( 1317 CodeGenFunction &CGF, StringRef CriticalName, 1318 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, 1319 const Expr *Hint) { 1320 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop"); 1321 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test"); 1322 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync"); 1323 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body"); 1324 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit"); 1325 1326 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 1327 1328 // Get the mask of active threads in the warp. 1329 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1330 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask)); 1331 // Fetch team-local id of the thread. 1332 llvm::Value *ThreadID = RT.getGPUThreadID(CGF); 1333 1334 // Get the width of the team. 1335 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF); 1336 1337 // Initialize the counter variable for the loop. 1338 QualType Int32Ty = 1339 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0); 1340 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter"); 1341 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty); 1342 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal, 1343 /*isInit=*/true); 1344 1345 // Block checks if loop counter exceeds upper bound. 1346 CGF.EmitBlock(LoopBB); 1347 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 1348 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth); 1349 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB); 1350 1351 // Block tests which single thread should execute region, and which threads 1352 // should go straight to synchronisation point. 1353 CGF.EmitBlock(TestBB); 1354 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc); 1355 llvm::Value *CmpThreadToCounter = 1356 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal); 1357 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB); 1358 1359 // Block emits the body of the critical region. 1360 CGF.EmitBlock(BodyBB); 1361 1362 // Output the critical statement. 1363 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc, 1364 Hint); 1365 1366 // After the body surrounded by the critical region, the single executing 1367 // thread will jump to the synchronisation point. 1368 // Block waits for all threads in current team to finish then increments the 1369 // counter variable and returns to the loop. 1370 CGF.EmitBlock(SyncBB); 1371 // Reconverge active threads in the warp. 1372 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1373 CGM.getModule(), OMPRTL___kmpc_syncwarp), 1374 Mask); 1375 1376 llvm::Value *IncCounterVal = 1377 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1)); 1378 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal); 1379 CGF.EmitBranch(LoopBB); 1380 1381 // Block that is reached when all threads in the team complete the region. 1382 CGF.EmitBlock(ExitBB, /*IsFinished=*/true); 1383 } 1384 1385 /// Cast value to the specified type. 1386 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val, 1387 QualType ValTy, QualType CastTy, 1388 SourceLocation Loc) { 1389 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && 1390 "Cast type must sized."); 1391 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && 1392 "Val type must sized."); 1393 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy); 1394 if (ValTy == CastTy) 1395 return Val; 1396 if (CGF.getContext().getTypeSizeInChars(ValTy) == 1397 CGF.getContext().getTypeSizeInChars(CastTy)) 1398 return CGF.Builder.CreateBitCast(Val, LLVMCastTy); 1399 if (CastTy->isIntegerType() && ValTy->isIntegerType()) 1400 return CGF.Builder.CreateIntCast(Val, LLVMCastTy, 1401 CastTy->hasSignedIntegerRepresentation()); 1402 Address CastItem = CGF.CreateMemTemp(CastTy); 1403 Address ValCastItem = CastItem.withElementType(Val->getType()); 1404 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy, 1405 LValueBaseInfo(AlignmentSource::Type), 1406 TBAAAccessInfo()); 1407 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc, 1408 LValueBaseInfo(AlignmentSource::Type), 1409 TBAAAccessInfo()); 1410 } 1411 1412 /// 1413 /// Design of OpenMP reductions on the GPU 1414 /// 1415 /// Consider a typical OpenMP program with one or more reduction 1416 /// clauses: 1417 /// 1418 /// float foo; 1419 /// double bar; 1420 /// #pragma omp target teams distribute parallel for \ 1421 /// reduction(+:foo) reduction(*:bar) 1422 /// for (int i = 0; i < N; i++) { 1423 /// foo += A[i]; bar *= B[i]; 1424 /// } 1425 /// 1426 /// where 'foo' and 'bar' are reduced across all OpenMP threads in 1427 /// all teams. In our OpenMP implementation on the NVPTX device an 1428 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads 1429 /// within a team are mapped to CUDA threads within a threadblock. 1430 /// Our goal is to efficiently aggregate values across all OpenMP 1431 /// threads such that: 1432 /// 1433 /// - the compiler and runtime are logically concise, and 1434 /// - the reduction is performed efficiently in a hierarchical 1435 /// manner as follows: within OpenMP threads in the same warp, 1436 /// across warps in a threadblock, and finally across teams on 1437 /// the NVPTX device. 1438 /// 1439 /// Introduction to Decoupling 1440 /// 1441 /// We would like to decouple the compiler and the runtime so that the 1442 /// latter is ignorant of the reduction variables (number, data types) 1443 /// and the reduction operators. This allows a simpler interface 1444 /// and implementation while still attaining good performance. 1445 /// 1446 /// Pseudocode for the aforementioned OpenMP program generated by the 1447 /// compiler is as follows: 1448 /// 1449 /// 1. Create private copies of reduction variables on each OpenMP 1450 /// thread: 'foo_private', 'bar_private' 1451 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned 1452 /// to it and writes the result in 'foo_private' and 'bar_private' 1453 /// respectively. 1454 /// 3. Call the OpenMP runtime on the GPU to reduce within a team 1455 /// and store the result on the team master: 1456 /// 1457 /// __kmpc_nvptx_parallel_reduce_nowait_v2(..., 1458 /// reduceData, shuffleReduceFn, interWarpCpyFn) 1459 /// 1460 /// where: 1461 /// struct ReduceData { 1462 /// double *foo; 1463 /// double *bar; 1464 /// } reduceData 1465 /// reduceData.foo = &foo_private 1466 /// reduceData.bar = &bar_private 1467 /// 1468 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two 1469 /// auxiliary functions generated by the compiler that operate on 1470 /// variables of type 'ReduceData'. They aid the runtime perform 1471 /// algorithmic steps in a data agnostic manner. 1472 /// 1473 /// 'shuffleReduceFn' is a pointer to a function that reduces data 1474 /// of type 'ReduceData' across two OpenMP threads (lanes) in the 1475 /// same warp. It takes the following arguments as input: 1476 /// 1477 /// a. variable of type 'ReduceData' on the calling lane, 1478 /// b. its lane_id, 1479 /// c. an offset relative to the current lane_id to generate a 1480 /// remote_lane_id. The remote lane contains the second 1481 /// variable of type 'ReduceData' that is to be reduced. 1482 /// d. an algorithm version parameter determining which reduction 1483 /// algorithm to use. 1484 /// 1485 /// 'shuffleReduceFn' retrieves data from the remote lane using 1486 /// efficient GPU shuffle intrinsics and reduces, using the 1487 /// algorithm specified by the 4th parameter, the two operands 1488 /// element-wise. The result is written to the first operand. 1489 /// 1490 /// Different reduction algorithms are implemented in different 1491 /// runtime functions, all calling 'shuffleReduceFn' to perform 1492 /// the essential reduction step. Therefore, based on the 4th 1493 /// parameter, this function behaves slightly differently to 1494 /// cooperate with the runtime to ensure correctness under 1495 /// different circumstances. 1496 /// 1497 /// 'InterWarpCpyFn' is a pointer to a function that transfers 1498 /// reduced variables across warps. It tunnels, through CUDA 1499 /// shared memory, the thread-private data of type 'ReduceData' 1500 /// from lane 0 of each warp to a lane in the first warp. 1501 /// 4. Call the OpenMP runtime on the GPU to reduce across teams. 1502 /// The last team writes the global reduced value to memory. 1503 /// 1504 /// ret = __kmpc_nvptx_teams_reduce_nowait(..., 1505 /// reduceData, shuffleReduceFn, interWarpCpyFn, 1506 /// scratchpadCopyFn, loadAndReduceFn) 1507 /// 1508 /// 'scratchpadCopyFn' is a helper that stores reduced 1509 /// data from the team master to a scratchpad array in 1510 /// global memory. 1511 /// 1512 /// 'loadAndReduceFn' is a helper that loads data from 1513 /// the scratchpad array and reduces it with the input 1514 /// operand. 1515 /// 1516 /// These compiler generated functions hide address 1517 /// calculation and alignment information from the runtime. 1518 /// 5. if ret == 1: 1519 /// The team master of the last team stores the reduced 1520 /// result to the globals in memory. 1521 /// foo += reduceData.foo; bar *= reduceData.bar 1522 /// 1523 /// 1524 /// Warp Reduction Algorithms 1525 /// 1526 /// On the warp level, we have three algorithms implemented in the 1527 /// OpenMP runtime depending on the number of active lanes: 1528 /// 1529 /// Full Warp Reduction 1530 /// 1531 /// The reduce algorithm within a warp where all lanes are active 1532 /// is implemented in the runtime as follows: 1533 /// 1534 /// full_warp_reduce(void *reduce_data, 1535 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1536 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) 1537 /// ShuffleReduceFn(reduce_data, 0, offset, 0); 1538 /// } 1539 /// 1540 /// The algorithm completes in log(2, WARPSIZE) steps. 1541 /// 1542 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is 1543 /// not used therefore we save instructions by not retrieving lane_id 1544 /// from the corresponding special registers. The 4th parameter, which 1545 /// represents the version of the algorithm being used, is set to 0 to 1546 /// signify full warp reduction. 1547 /// 1548 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1549 /// 1550 /// #reduce_elem refers to an element in the local lane's data structure 1551 /// #remote_elem is retrieved from a remote lane 1552 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1553 /// reduce_elem = reduce_elem REDUCE_OP remote_elem; 1554 /// 1555 /// Contiguous Partial Warp Reduction 1556 /// 1557 /// This reduce algorithm is used within a warp where only the first 1558 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the 1559 /// number of OpenMP threads in a parallel region is not a multiple of 1560 /// WARPSIZE. The algorithm is implemented in the runtime as follows: 1561 /// 1562 /// void 1563 /// contiguous_partial_reduce(void *reduce_data, 1564 /// kmp_ShuffleReductFctPtr ShuffleReduceFn, 1565 /// int size, int lane_id) { 1566 /// int curr_size; 1567 /// int offset; 1568 /// curr_size = size; 1569 /// mask = curr_size/2; 1570 /// while (offset>0) { 1571 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1); 1572 /// curr_size = (curr_size+1)/2; 1573 /// offset = curr_size/2; 1574 /// } 1575 /// } 1576 /// 1577 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1578 /// 1579 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1580 /// if (lane_id < offset) 1581 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1582 /// else 1583 /// reduce_elem = remote_elem 1584 /// 1585 /// This algorithm assumes that the data to be reduced are located in a 1586 /// contiguous subset of lanes starting from the first. When there is 1587 /// an odd number of active lanes, the data in the last lane is not 1588 /// aggregated with any other lane's dat but is instead copied over. 1589 /// 1590 /// Dispersed Partial Warp Reduction 1591 /// 1592 /// This algorithm is used within a warp when any discontiguous subset of 1593 /// lanes are active. It is used to implement the reduction operation 1594 /// across lanes in an OpenMP simd region or in a nested parallel region. 1595 /// 1596 /// void 1597 /// dispersed_partial_reduce(void *reduce_data, 1598 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) { 1599 /// int size, remote_id; 1600 /// int logical_lane_id = number_of_active_lanes_before_me() * 2; 1601 /// do { 1602 /// remote_id = next_active_lane_id_right_after_me(); 1603 /// # the above function returns 0 of no active lane 1604 /// # is present right after the current lane. 1605 /// size = number_of_active_lanes_in_this_warp(); 1606 /// logical_lane_id /= 2; 1607 /// ShuffleReduceFn(reduce_data, logical_lane_id, 1608 /// remote_id-1-threadIdx.x, 2); 1609 /// } while (logical_lane_id % 2 == 0 && size > 1); 1610 /// } 1611 /// 1612 /// There is no assumption made about the initial state of the reduction. 1613 /// Any number of lanes (>=1) could be active at any position. The reduction 1614 /// result is returned in the first active lane. 1615 /// 1616 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows: 1617 /// 1618 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); 1619 /// if (lane_id % 2 == 0 && offset > 0) 1620 /// reduce_elem = reduce_elem REDUCE_OP remote_elem 1621 /// else 1622 /// reduce_elem = remote_elem 1623 /// 1624 /// 1625 /// Intra-Team Reduction 1626 /// 1627 /// This function, as implemented in the runtime call 1628 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP 1629 /// threads in a team. It first reduces within a warp using the 1630 /// aforementioned algorithms. We then proceed to gather all such 1631 /// reduced values at the first warp. 1632 /// 1633 /// The runtime makes use of the function 'InterWarpCpyFn', which copies 1634 /// data from each of the "warp master" (zeroth lane of each warp, where 1635 /// warp-reduced data is held) to the zeroth warp. This step reduces (in 1636 /// a mathematical sense) the problem of reduction across warp masters in 1637 /// a block to the problem of warp reduction. 1638 /// 1639 /// 1640 /// Inter-Team Reduction 1641 /// 1642 /// Once a team has reduced its data to a single value, it is stored in 1643 /// a global scratchpad array. Since each team has a distinct slot, this 1644 /// can be done without locking. 1645 /// 1646 /// The last team to write to the scratchpad array proceeds to reduce the 1647 /// scratchpad array. One or more workers in the last team use the helper 1648 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., 1649 /// the k'th worker reduces every k'th element. 1650 /// 1651 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to 1652 /// reduce across workers and compute a globally reduced value. 1653 /// 1654 void CGOpenMPRuntimeGPU::emitReduction( 1655 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates, 1656 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs, 1657 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) { 1658 if (!CGF.HaveInsertPoint()) 1659 return; 1660 1661 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); 1662 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); 1663 1664 ASTContext &C = CGM.getContext(); 1665 1666 if (Options.SimpleReduction) { 1667 assert(!TeamsReduction && !ParallelReduction && 1668 "Invalid reduction selection in emitReduction."); 1669 (void)ParallelReduction; 1670 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, 1671 ReductionOps, Options); 1672 return; 1673 } 1674 1675 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap; 1676 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size()); 1677 int Cnt = 0; 1678 for (const Expr *DRE : Privates) { 1679 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl(); 1680 ++Cnt; 1681 } 1682 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars( 1683 CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1); 1684 1685 if (TeamsReduction) 1686 TeamsReductions.push_back(ReductionRec); 1687 1688 // Source location for the ident struct 1689 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); 1690 1691 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; 1692 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(), 1693 CGF.AllocaInsertPt->getIterator()); 1694 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(), 1695 CGF.Builder.GetInsertPoint()); 1696 llvm::OpenMPIRBuilder::LocationDescription OmpLoc( 1697 CodeGenIP, CGF.SourceLocToDebugLoc(Loc)); 1698 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos; 1699 1700 CodeGenFunction::OMPPrivateScope Scope(CGF); 1701 unsigned Idx = 0; 1702 for (const Expr *Private : Privates) { 1703 llvm::Type *ElementType; 1704 llvm::Value *Variable; 1705 llvm::Value *PrivateVariable; 1706 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr; 1707 ElementType = CGF.ConvertTypeForMem(Private->getType()); 1708 const auto *RHSVar = 1709 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl()); 1710 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF); 1711 const auto *LHSVar = 1712 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl()); 1713 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF); 1714 llvm::OpenMPIRBuilder::EvalKind EvalKind; 1715 switch (CGF.getEvaluationKind(Private->getType())) { 1716 case TEK_Scalar: 1717 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar; 1718 break; 1719 case TEK_Complex: 1720 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex; 1721 break; 1722 case TEK_Aggregate: 1723 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate; 1724 break; 1725 } 1726 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I, 1727 llvm::Value **LHSPtr, llvm::Value **RHSPtr, 1728 llvm::Function *NewFunc) { 1729 CGF.Builder.restoreIP(CodeGenIP); 1730 auto *CurFn = CGF.CurFn; 1731 CGF.CurFn = NewFunc; 1732 1733 *LHSPtr = CGF.GetAddrOfLocalVar( 1734 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl())) 1735 .emitRawPointer(CGF); 1736 *RHSPtr = CGF.GetAddrOfLocalVar( 1737 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl())) 1738 .emitRawPointer(CGF); 1739 1740 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I], 1741 cast<DeclRefExpr>(LHSExprs[I]), 1742 cast<DeclRefExpr>(RHSExprs[I])); 1743 1744 CGF.CurFn = CurFn; 1745 1746 return InsertPointTy(CGF.Builder.GetInsertBlock(), 1747 CGF.Builder.GetInsertPoint()); 1748 }; 1749 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo( 1750 ElementType, Variable, PrivateVariable, EvalKind, 1751 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen)); 1752 Idx++; 1753 } 1754 1755 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = 1756 cantFail(OMPBuilder.createReductionsGPU( 1757 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction, 1758 llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang, 1759 CGF.getTarget().getGridValue(), 1760 C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc)); 1761 CGF.Builder.restoreIP(AfterIP); 1762 } 1763 1764 const VarDecl * 1765 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD, 1766 const VarDecl *NativeParam) const { 1767 if (!NativeParam->getType()->isReferenceType()) 1768 return NativeParam; 1769 QualType ArgType = NativeParam->getType(); 1770 QualifierCollector QC; 1771 const Type *NonQualTy = QC.strip(ArgType); 1772 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1773 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) { 1774 if (Attr->getCaptureKind() == OMPC_map) { 1775 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy, 1776 LangAS::opencl_global); 1777 } 1778 } 1779 ArgType = CGM.getContext().getPointerType(PointeeTy); 1780 QC.addRestrict(); 1781 enum { NVPTX_local_addr = 5 }; 1782 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr)); 1783 ArgType = QC.apply(CGM.getContext(), ArgType); 1784 if (isa<ImplicitParamDecl>(NativeParam)) 1785 return ImplicitParamDecl::Create( 1786 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(), 1787 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other); 1788 return ParmVarDecl::Create( 1789 CGM.getContext(), 1790 const_cast<DeclContext *>(NativeParam->getDeclContext()), 1791 NativeParam->getBeginLoc(), NativeParam->getLocation(), 1792 NativeParam->getIdentifier(), ArgType, 1793 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr); 1794 } 1795 1796 Address 1797 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF, 1798 const VarDecl *NativeParam, 1799 const VarDecl *TargetParam) const { 1800 assert(NativeParam != TargetParam && 1801 NativeParam->getType()->isReferenceType() && 1802 "Native arg must not be the same as target arg."); 1803 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam); 1804 QualType NativeParamType = NativeParam->getType(); 1805 QualifierCollector QC; 1806 const Type *NonQualTy = QC.strip(NativeParamType); 1807 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType(); 1808 unsigned NativePointeeAddrSpace = 1809 CGF.getTypes().getTargetAddressSpace(NativePointeeTy); 1810 QualType TargetTy = TargetParam->getType(); 1811 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false, 1812 TargetTy, SourceLocation()); 1813 // Cast to native address space. 1814 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 1815 TargetAddr, 1816 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace)); 1817 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType); 1818 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false, 1819 NativeParamType); 1820 return NativeParamAddr; 1821 } 1822 1823 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall( 1824 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, 1825 ArrayRef<llvm::Value *> Args) const { 1826 SmallVector<llvm::Value *, 4> TargetArgs; 1827 TargetArgs.reserve(Args.size()); 1828 auto *FnType = OutlinedFn.getFunctionType(); 1829 for (unsigned I = 0, E = Args.size(); I < E; ++I) { 1830 if (FnType->isVarArg() && FnType->getNumParams() <= I) { 1831 TargetArgs.append(std::next(Args.begin(), I), Args.end()); 1832 break; 1833 } 1834 llvm::Type *TargetType = FnType->getParamType(I); 1835 llvm::Value *NativeArg = Args[I]; 1836 if (!TargetType->isPointerTy()) { 1837 TargetArgs.emplace_back(NativeArg); 1838 continue; 1839 } 1840 TargetArgs.emplace_back( 1841 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType)); 1842 } 1843 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); 1844 } 1845 1846 /// Emit function which wraps the outline parallel region 1847 /// and controls the arguments which are passed to this function. 1848 /// The wrapper ensures that the outlined function is called 1849 /// with the correct arguments when data is shared. 1850 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper( 1851 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { 1852 ASTContext &Ctx = CGM.getContext(); 1853 const auto &CS = *D.getCapturedStmt(OMPD_parallel); 1854 1855 // Create a function that takes as argument the source thread. 1856 FunctionArgList WrapperArgs; 1857 QualType Int16QTy = 1858 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); 1859 QualType Int32QTy = 1860 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); 1861 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), 1862 /*Id=*/nullptr, Int16QTy, 1863 ImplicitParamKind::Other); 1864 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), 1865 /*Id=*/nullptr, Int32QTy, 1866 ImplicitParamKind::Other); 1867 WrapperArgs.emplace_back(&ParallelLevelArg); 1868 WrapperArgs.emplace_back(&WrapperArg); 1869 1870 const CGFunctionInfo &CGFI = 1871 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); 1872 1873 auto *Fn = llvm::Function::Create( 1874 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, 1875 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); 1876 1877 // Ensure we do not inline the function. This is trivially true for the ones 1878 // passed to __kmpc_fork_call but the ones calles in serialized regions 1879 // could be inlined. This is not a perfect but it is closer to the invariant 1880 // we want, namely, every data environment starts with a new function. 1881 // TODO: We should pass the if condition to the runtime function and do the 1882 // handling there. Much cleaner code. 1883 Fn->addFnAttr(llvm::Attribute::NoInline); 1884 1885 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); 1886 Fn->setLinkage(llvm::GlobalValue::InternalLinkage); 1887 Fn->setDoesNotRecurse(); 1888 1889 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); 1890 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, 1891 D.getBeginLoc(), D.getBeginLoc()); 1892 1893 const auto *RD = CS.getCapturedRecordDecl(); 1894 auto CurField = RD->field_begin(); 1895 1896 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, 1897 /*Name=*/".zero.addr"); 1898 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr); 1899 // Get the array of arguments. 1900 SmallVector<llvm::Value *, 8> Args; 1901 1902 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF)); 1903 Args.emplace_back(ZeroAddr.emitRawPointer(CGF)); 1904 1905 CGBuilderTy &Bld = CGF.Builder; 1906 auto CI = CS.capture_begin(); 1907 1908 // Use global memory for data sharing. 1909 // Handle passing of global args to workers. 1910 RawAddress GlobalArgs = 1911 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); 1912 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); 1913 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; 1914 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( 1915 CGM.getModule(), OMPRTL___kmpc_get_shared_variables), 1916 DataSharingArgs); 1917 1918 // Retrieve the shared variables from the list of references returned 1919 // by the runtime. Pass the variables to the outlined function. 1920 Address SharedArgListAddress = Address::invalid(); 1921 if (CS.capture_size() > 0 || 1922 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 1923 SharedArgListAddress = CGF.EmitLoadOfPointer( 1924 GlobalArgs, CGF.getContext() 1925 .getPointerType(CGF.getContext().VoidPtrTy) 1926 .castAs<PointerType>()); 1927 } 1928 unsigned Idx = 0; 1929 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { 1930 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 1931 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1932 Src, Bld.getPtrTy(0), CGF.SizeTy); 1933 llvm::Value *LB = CGF.EmitLoadOfScalar( 1934 TypedAddress, 1935 /*Volatile=*/false, 1936 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 1937 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc()); 1938 Args.emplace_back(LB); 1939 ++Idx; 1940 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx); 1941 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0), 1942 CGF.SizeTy); 1943 llvm::Value *UB = CGF.EmitLoadOfScalar( 1944 TypedAddress, 1945 /*Volatile=*/false, 1946 CGF.getContext().getPointerType(CGF.getContext().getSizeType()), 1947 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc()); 1948 Args.emplace_back(UB); 1949 ++Idx; 1950 } 1951 if (CS.capture_size() > 0) { 1952 ASTContext &CGFContext = CGF.getContext(); 1953 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { 1954 QualType ElemTy = CurField->getType(); 1955 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx); 1956 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( 1957 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)), 1958 CGF.ConvertTypeForMem(ElemTy)); 1959 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, 1960 /*Volatile=*/false, 1961 CGFContext.getPointerType(ElemTy), 1962 CI->getLocation()); 1963 if (CI->capturesVariableByCopy() && 1964 !CI->getCapturedVar()->getType()->isAnyPointerType()) { 1965 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), 1966 CI->getLocation()); 1967 } 1968 Args.emplace_back(Arg); 1969 } 1970 } 1971 1972 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); 1973 CGF.FinishFunction(); 1974 return Fn; 1975 } 1976 1977 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, 1978 const Decl *D) { 1979 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 1980 return; 1981 1982 assert(D && "Expected function or captured|block decl."); 1983 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && 1984 "Function is registered already."); 1985 assert((!TeamAndReductions.first || TeamAndReductions.first == D) && 1986 "Team is set but not processed."); 1987 const Stmt *Body = nullptr; 1988 bool NeedToDelayGlobalization = false; 1989 if (const auto *FD = dyn_cast<FunctionDecl>(D)) { 1990 Body = FD->getBody(); 1991 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) { 1992 Body = BD->getBody(); 1993 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) { 1994 Body = CD->getBody(); 1995 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP; 1996 if (NeedToDelayGlobalization && 1997 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) 1998 return; 1999 } 2000 if (!Body) 2001 return; 2002 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); 2003 VarChecker.Visit(Body); 2004 const RecordDecl *GlobalizedVarsRecord = 2005 VarChecker.getGlobalizedRecord(IsInTTDRegion); 2006 TeamAndReductions.first = nullptr; 2007 TeamAndReductions.second.clear(); 2008 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls = 2009 VarChecker.getEscapedVariableLengthDecls(); 2010 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls = 2011 VarChecker.getDelayedVariableLengthDecls(); 2012 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() && 2013 DelayedVariableLengthDecls.empty()) 2014 return; 2015 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; 2016 I->getSecond().MappedParams = 2017 std::make_unique<CodeGenFunction::OMPMapVars>(); 2018 I->getSecond().EscapedParameters.insert( 2019 VarChecker.getEscapedParameters().begin(), 2020 VarChecker.getEscapedParameters().end()); 2021 I->getSecond().EscapedVariableLengthDecls.append( 2022 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end()); 2023 I->getSecond().DelayedVariableLengthDecls.append( 2024 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end()); 2025 DeclToAddrMapTy &Data = I->getSecond().LocalVarData; 2026 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { 2027 assert(VD->isCanonicalDecl() && "Expected canonical declaration"); 2028 Data.try_emplace(VD); 2029 } 2030 if (!NeedToDelayGlobalization) { 2031 emitGenericVarsProlog(CGF, D->getBeginLoc()); 2032 struct GlobalizationScope final : EHScopeStack::Cleanup { 2033 GlobalizationScope() = default; 2034 2035 void Emit(CodeGenFunction &CGF, Flags flags) override { 2036 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()) 2037 .emitGenericVarsEpilog(CGF); 2038 } 2039 }; 2040 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup); 2041 } 2042 } 2043 2044 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF, 2045 const VarDecl *VD) { 2046 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) { 2047 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2048 auto AS = LangAS::Default; 2049 switch (A->getAllocatorType()) { 2050 case OMPAllocateDeclAttr::OMPNullMemAlloc: 2051 case OMPAllocateDeclAttr::OMPDefaultMemAlloc: 2052 case OMPAllocateDeclAttr::OMPHighBWMemAlloc: 2053 case OMPAllocateDeclAttr::OMPLowLatMemAlloc: 2054 break; 2055 case OMPAllocateDeclAttr::OMPThreadMemAlloc: 2056 return Address::invalid(); 2057 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: 2058 // TODO: implement aupport for user-defined allocators. 2059 return Address::invalid(); 2060 case OMPAllocateDeclAttr::OMPConstMemAlloc: 2061 AS = LangAS::cuda_constant; 2062 break; 2063 case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 2064 AS = LangAS::cuda_shared; 2065 break; 2066 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 2067 case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 2068 break; 2069 } 2070 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType()); 2071 auto *GV = new llvm::GlobalVariable( 2072 CGM.getModule(), VarTy, /*isConstant=*/false, 2073 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy), 2074 VD->getName(), 2075 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, 2076 CGM.getContext().getTargetAddressSpace(AS)); 2077 CharUnits Align = CGM.getContext().getDeclAlign(VD); 2078 GV->setAlignment(Align.getAsAlign()); 2079 return Address( 2080 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( 2081 GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace( 2082 VD->getType().getAddressSpace()))), 2083 VarTy, Align); 2084 } 2085 2086 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic) 2087 return Address::invalid(); 2088 2089 VD = VD->getCanonicalDecl(); 2090 auto I = FunctionGlobalizedDecls.find(CGF.CurFn); 2091 if (I == FunctionGlobalizedDecls.end()) 2092 return Address::invalid(); 2093 auto VDI = I->getSecond().LocalVarData.find(VD); 2094 if (VDI != I->getSecond().LocalVarData.end()) 2095 return VDI->second.PrivateAddr; 2096 if (VD->hasAttrs()) { 2097 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()), 2098 E(VD->attr_end()); 2099 IT != E; ++IT) { 2100 auto VDI = I->getSecond().LocalVarData.find( 2101 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl()) 2102 ->getCanonicalDecl()); 2103 if (VDI != I->getSecond().LocalVarData.end()) 2104 return VDI->second.PrivateAddr; 2105 } 2106 } 2107 2108 return Address::invalid(); 2109 } 2110 2111 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { 2112 FunctionGlobalizedDecls.erase(CGF.CurFn); 2113 CGOpenMPRuntime::functionFinished(CGF); 2114 } 2115 2116 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk( 2117 CodeGenFunction &CGF, const OMPLoopDirective &S, 2118 OpenMPDistScheduleClauseKind &ScheduleKind, 2119 llvm::Value *&Chunk) const { 2120 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime()); 2121 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) { 2122 ScheduleKind = OMPC_DIST_SCHEDULE_static; 2123 Chunk = CGF.EmitScalarConversion( 2124 RT.getGPUNumThreads(CGF), 2125 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 2126 S.getIterationVariable()->getType(), S.getBeginLoc()); 2127 return; 2128 } 2129 CGOpenMPRuntime::getDefaultDistScheduleAndChunk( 2130 CGF, S, ScheduleKind, Chunk); 2131 } 2132 2133 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk( 2134 CodeGenFunction &CGF, const OMPLoopDirective &S, 2135 OpenMPScheduleClauseKind &ScheduleKind, 2136 const Expr *&ChunkExpr) const { 2137 ScheduleKind = OMPC_SCHEDULE_static; 2138 // Chunk size is 1 in this case. 2139 llvm::APInt ChunkSize(32, 1); 2140 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize, 2141 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), 2142 SourceLocation()); 2143 } 2144 2145 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas( 2146 CodeGenFunction &CGF, const OMPExecutableDirective &D) const { 2147 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && 2148 " Expected target-based directive."); 2149 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target); 2150 for (const CapturedStmt::Capture &C : CS->captures()) { 2151 // Capture variables captured by reference in lambdas for target-based 2152 // directives. 2153 if (!C.capturesVariable()) 2154 continue; 2155 const VarDecl *VD = C.getCapturedVar(); 2156 const auto *RD = VD->getType() 2157 .getCanonicalType() 2158 .getNonReferenceType() 2159 ->getAsCXXRecordDecl(); 2160 if (!RD || !RD->isLambda()) 2161 continue; 2162 Address VDAddr = CGF.GetAddrOfLocalVar(VD); 2163 LValue VDLVal; 2164 if (VD->getType().getCanonicalType()->isReferenceType()) 2165 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType()); 2166 else 2167 VDLVal = CGF.MakeAddrLValue( 2168 VDAddr, VD->getType().getCanonicalType().getNonReferenceType()); 2169 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures; 2170 FieldDecl *ThisCapture = nullptr; 2171 RD->getCaptureFields(Captures, ThisCapture); 2172 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) { 2173 LValue ThisLVal = 2174 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); 2175 llvm::Value *CXXThis = CGF.LoadCXXThis(); 2176 CGF.EmitStoreOfScalar(CXXThis, ThisLVal); 2177 } 2178 for (const LambdaCapture &LC : RD->captures()) { 2179 if (LC.getCaptureKind() != LCK_ByRef) 2180 continue; 2181 const ValueDecl *VD = LC.getCapturedVar(); 2182 // FIXME: For now VD is always a VarDecl because OpenMP does not support 2183 // capturing structured bindings in lambdas yet. 2184 if (!CS->capturesVariable(cast<VarDecl>(VD))) 2185 continue; 2186 auto It = Captures.find(VD); 2187 assert(It != Captures.end() && "Found lambda capture without field."); 2188 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); 2189 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD)); 2190 if (VD->getType().getCanonicalType()->isReferenceType()) 2191 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, 2192 VD->getType().getCanonicalType()) 2193 .getAddress(); 2194 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal); 2195 } 2196 } 2197 } 2198 2199 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD, 2200 LangAS &AS) { 2201 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>()) 2202 return false; 2203 const auto *A = VD->getAttr<OMPAllocateDeclAttr>(); 2204 switch(A->getAllocatorType()) { 2205 case OMPAllocateDeclAttr::OMPNullMemAlloc: 2206 case OMPAllocateDeclAttr::OMPDefaultMemAlloc: 2207 // Not supported, fallback to the default mem space. 2208 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc: 2209 case OMPAllocateDeclAttr::OMPCGroupMemAlloc: 2210 case OMPAllocateDeclAttr::OMPHighBWMemAlloc: 2211 case OMPAllocateDeclAttr::OMPLowLatMemAlloc: 2212 case OMPAllocateDeclAttr::OMPThreadMemAlloc: 2213 AS = LangAS::Default; 2214 return true; 2215 case OMPAllocateDeclAttr::OMPConstMemAlloc: 2216 AS = LangAS::cuda_constant; 2217 return true; 2218 case OMPAllocateDeclAttr::OMPPTeamMemAlloc: 2219 AS = LangAS::cuda_shared; 2220 return true; 2221 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc: 2222 llvm_unreachable("Expected predefined allocator for the variables with the " 2223 "static storage."); 2224 } 2225 return false; 2226 } 2227 2228 // Get current OffloadArch and ignore any unknown values 2229 static OffloadArch getOffloadArch(CodeGenModule &CGM) { 2230 if (!CGM.getTarget().hasFeature("ptx")) 2231 return OffloadArch::UNKNOWN; 2232 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) { 2233 if (Feature.getValue()) { 2234 OffloadArch Arch = StringToOffloadArch(Feature.getKey()); 2235 if (Arch != OffloadArch::UNKNOWN) 2236 return Arch; 2237 } 2238 } 2239 return OffloadArch::UNKNOWN; 2240 } 2241 2242 /// Check to see if target architecture supports unified addressing which is 2243 /// a restriction for OpenMP requires clause "unified_shared_memory". 2244 void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { 2245 for (const OMPClause *Clause : D->clauselists()) { 2246 if (Clause->getClauseKind() == OMPC_unified_shared_memory) { 2247 OffloadArch Arch = getOffloadArch(CGM); 2248 switch (Arch) { 2249 case OffloadArch::SM_20: 2250 case OffloadArch::SM_21: 2251 case OffloadArch::SM_30: 2252 case OffloadArch::SM_32_: 2253 case OffloadArch::SM_35: 2254 case OffloadArch::SM_37: 2255 case OffloadArch::SM_50: 2256 case OffloadArch::SM_52: 2257 case OffloadArch::SM_53: { 2258 SmallString<256> Buffer; 2259 llvm::raw_svector_ostream Out(Buffer); 2260 Out << "Target architecture " << OffloadArchToString(Arch) 2261 << " does not support unified addressing"; 2262 CGM.Error(Clause->getBeginLoc(), Out.str()); 2263 return; 2264 } 2265 case OffloadArch::SM_60: 2266 case OffloadArch::SM_61: 2267 case OffloadArch::SM_62: 2268 case OffloadArch::SM_70: 2269 case OffloadArch::SM_72: 2270 case OffloadArch::SM_75: 2271 case OffloadArch::SM_80: 2272 case OffloadArch::SM_86: 2273 case OffloadArch::SM_87: 2274 case OffloadArch::SM_89: 2275 case OffloadArch::SM_90: 2276 case OffloadArch::SM_90a: 2277 case OffloadArch::SM_100: 2278 case OffloadArch::SM_100a: 2279 case OffloadArch::SM_101: 2280 case OffloadArch::SM_101a: 2281 case OffloadArch::SM_120: 2282 case OffloadArch::SM_120a: 2283 case OffloadArch::GFX600: 2284 case OffloadArch::GFX601: 2285 case OffloadArch::GFX602: 2286 case OffloadArch::GFX700: 2287 case OffloadArch::GFX701: 2288 case OffloadArch::GFX702: 2289 case OffloadArch::GFX703: 2290 case OffloadArch::GFX704: 2291 case OffloadArch::GFX705: 2292 case OffloadArch::GFX801: 2293 case OffloadArch::GFX802: 2294 case OffloadArch::GFX803: 2295 case OffloadArch::GFX805: 2296 case OffloadArch::GFX810: 2297 case OffloadArch::GFX9_GENERIC: 2298 case OffloadArch::GFX900: 2299 case OffloadArch::GFX902: 2300 case OffloadArch::GFX904: 2301 case OffloadArch::GFX906: 2302 case OffloadArch::GFX908: 2303 case OffloadArch::GFX909: 2304 case OffloadArch::GFX90a: 2305 case OffloadArch::GFX90c: 2306 case OffloadArch::GFX9_4_GENERIC: 2307 case OffloadArch::GFX942: 2308 case OffloadArch::GFX950: 2309 case OffloadArch::GFX10_1_GENERIC: 2310 case OffloadArch::GFX1010: 2311 case OffloadArch::GFX1011: 2312 case OffloadArch::GFX1012: 2313 case OffloadArch::GFX1013: 2314 case OffloadArch::GFX10_3_GENERIC: 2315 case OffloadArch::GFX1030: 2316 case OffloadArch::GFX1031: 2317 case OffloadArch::GFX1032: 2318 case OffloadArch::GFX1033: 2319 case OffloadArch::GFX1034: 2320 case OffloadArch::GFX1035: 2321 case OffloadArch::GFX1036: 2322 case OffloadArch::GFX11_GENERIC: 2323 case OffloadArch::GFX1100: 2324 case OffloadArch::GFX1101: 2325 case OffloadArch::GFX1102: 2326 case OffloadArch::GFX1103: 2327 case OffloadArch::GFX1150: 2328 case OffloadArch::GFX1151: 2329 case OffloadArch::GFX1152: 2330 case OffloadArch::GFX1153: 2331 case OffloadArch::GFX12_GENERIC: 2332 case OffloadArch::GFX1200: 2333 case OffloadArch::GFX1201: 2334 case OffloadArch::GFX1250: 2335 case OffloadArch::AMDGCNSPIRV: 2336 case OffloadArch::Generic: 2337 case OffloadArch::GRANITERAPIDS: 2338 case OffloadArch::BMG_G21: 2339 case OffloadArch::UNUSED: 2340 case OffloadArch::UNKNOWN: 2341 break; 2342 case OffloadArch::LAST: 2343 llvm_unreachable("Unexpected GPU arch."); 2344 } 2345 } 2346 } 2347 CGOpenMPRuntime::processRequiresDirective(D); 2348 } 2349 2350 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) { 2351 CGBuilderTy &Bld = CGF.Builder; 2352 llvm::Module *M = &CGF.CGM.getModule(); 2353 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block"; 2354 llvm::Function *F = M->getFunction(LocSize); 2355 if (!F) { 2356 F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false), 2357 llvm::GlobalVariable::ExternalLinkage, LocSize, 2358 &CGF.CGM.getModule()); 2359 } 2360 return Bld.CreateCall(F, {}, "nvptx_num_threads"); 2361 } 2362 2363 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) { 2364 ArrayRef<llvm::Value *> Args{}; 2365 return CGF.EmitRuntimeCall( 2366 OMPBuilder.getOrCreateRuntimeFunction( 2367 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block), 2368 Args); 2369 } 2370