1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/OpenMPClause.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/Support/MathExtras.h"
25
26 using namespace clang;
27 using namespace CodeGen;
28 using namespace llvm::omp;
29
30 namespace {
31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32 class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40 public:
41   NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47   void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58   void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63   void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66 };
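// Illustrative sketch (not code from this file; EnterFn, ExitFn and Args are
// hypothetical): a conditional action brackets a guarded runtime region.
// Enter() emits the enter call plus the branch, Exit() the exit call, and
// Done() closes the conditional control flow:
//
//   NVPTXActionTy Action(EnterFn, Args, ExitFn, Args, /*Conditional=*/true);
//   Action.Enter(CGF); // call EnterFn; branch to omp_if.then / omp_if.end
//   /* ... emit the guarded region body ... */
//   Action.Exit(CGF);  // call ExitFn
//   Action.Done(CGF);  // branch to and emit omp_if.end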
67
68 /// A class to track the execution mode when codegening directives within
69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70 /// to the target region and used by the directives contained in it, such as
71 /// 'parallel', to emit optimized code.
72 class ExecutionRuntimeModesRAII {
73 private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78 public:
79   ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85   ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86 };
87
88 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89 RefExpr = RefExpr->IgnoreParens();
90 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93 Base = TempASE->getBase()->IgnoreParenImpCasts();
94 RefExpr = Base;
95 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
96 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
98 Base = TempOASE->getBase()->IgnoreParenImpCasts();
99 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100 Base = TempASE->getBase()->IgnoreParenImpCasts();
101 RefExpr = Base;
102 }
103 RefExpr = RefExpr->IgnoreParenImpCasts();
104 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106 const auto *ME = cast<MemberExpr>(RefExpr);
107 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108 }
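// For illustration: given a clause item such as 'a[i][j]' or an array section
// 'a[0:n]', the loops above strip the subscripts/sections down to the base and
// the canonical declaration of 'a' is returned; for a member item the
// MemberExpr branch returns the canonical declaration of the member.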
109
110 static RecordDecl *buildRecordForGlobalizedVars(
111 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114 &MappedDeclsFields,
115 int BufSize) {
116 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118 return nullptr;
119 SmallVector<VarsDataTy, 4> GlobalizedVars;
120 for (const ValueDecl *D : EscapedDecls)
121 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122 for (const ValueDecl *D : EscapedDeclsForTeams)
123 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124
125 // Build struct _globalized_locals_ty {
126 //         /*  globalized vars  */[WarpSize] align (decl_align)
127 // /* globalized vars */ for EscapedDeclsForTeams
128 // };
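  //  For illustration (a WarpSize of 32 is assumed here), with one escaped
  //  parallel-level 'int x' (BufSize == WarpSize) and one teams-level
  //  reduction variable 'double d', the implicit record is roughly:
  //
  //    struct _globalized_locals_ty {
  //      int x[32] __attribute__((aligned(4)));
  //      double d;   // keeps any 'aligned' attributes from the declaration
  //    };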
129 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130 GlobalizedRD->startDefinition();
131 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
132 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133 for (const auto &Pair : GlobalizedVars) {
134 const ValueDecl *VD = Pair.second;
135 QualType Type = VD->getType();
136 if (Type->isLValueReferenceType())
137 Type = C.getPointerType(Type.getNonReferenceType());
138 else
139 Type = Type.getNonReferenceType();
140 SourceLocation Loc = VD->getLocation();
141 FieldDecl *Field;
142 if (SingleEscaped.count(VD)) {
143 Field = FieldDecl::Create(
144 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146 /*BW=*/nullptr, /*Mutable=*/false,
147 /*InitStyle=*/ICIS_NoInit);
148 Field->setAccess(AS_public);
149 if (VD->hasAttrs()) {
150 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151 E(VD->getAttrs().end());
152 I != E; ++I)
153 Field->addAttr(*I);
154 }
155 } else {
156 if (BufSize > 1) {
157 llvm::APInt ArraySize(32, BufSize);
158 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
159 ArraySizeModifier::Normal, 0);
160 }
161 Field = FieldDecl::Create(
162 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164 /*BW=*/nullptr, /*Mutable=*/false,
165 /*InitStyle=*/ICIS_NoInit);
166 Field->setAccess(AS_public);
167 llvm::APInt Align(32, Pair.first.getQuantity());
168 Field->addAttr(AlignedAttr::CreateImplicit(
169 C, /*IsAlignmentExpr=*/true,
170 IntegerLiteral::Create(C, Align,
171 C.getIntTypeForBitwidth(32, /*Signed=*/0),
172 SourceLocation()),
173 {}, AlignedAttr::GNU_aligned));
174 }
175 GlobalizedRD->addDecl(Field);
176 MappedDeclsFields.try_emplace(VD, Field);
177 }
178 GlobalizedRD->completeDefinition();
179 return GlobalizedRD;
180 }
181
182 /// Get the list of variables that can escape their declaration context.
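/// For illustration (a typical case, not taken from this file): a local that
/// is captured by reference in a nested parallel region escapes its
/// declaration context and must be globalized, e.g.
///
///   #pragma omp target teams
///   {
///     int x = 0;                  // escapes: shared by the parallel threads
///     #pragma omp parallel
///     x = omp_get_thread_num();   // captured by reference in the outlined fn
///   }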
183 class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195   void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208         // Check if we need to capture the variable that was already
209         // captured by value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(VD);
242 else
243 DelayedVariableLengthDecls.insert(VD);
244 } else
245 EscapedDecls.insert(VD);
246 }
247
248   void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260   void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270           // Check if the variable is privatized in the combined construct;
271           // in that case its private copies must be shared in the inner
272           // parallel directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307   void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, WarpSize);
319 }
320
321 public:
322   CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327   void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332 VisitValueDecl(VD);
333 }
334   void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355   void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367   void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381   void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393   void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410   void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421   void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433   void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445   void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456   void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464   /// Returns the record that handles all the escaped local variables and is
465   /// used instead of their original storage.
466   const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473   const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480   ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484   /// Returns the set of escaped local variables that are actually parameters
485   /// passed by value.
486   const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492   ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498   ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501 };
502 } // anonymous namespace
503
504 CGOpenMPRuntimeGPU::ExecutionMode
505 CGOpenMPRuntimeGPU::getExecutionMode() const {
506 return CurrentExecutionMode;
507 }
508
509 CGOpenMPRuntimeGPU::DataSharingMode
510 CGOpenMPRuntimeGPU::getDataSharingMode() const {
511 return CurrentDataSharingMode;
512 }
513
514 /// Check for inner (nested) SPMD construct, if any
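/// For example, the following target region can be emitted in SPMD mode
/// because the single nested construct is a parallel worksharing loop:
///
///   #pragma omp target teams
///   #pragma omp parallel for
///   for (int i = 0; i < N; ++i)
///     body(i);
///
/// whereas '#pragma omp target teams' with only serial code inside is emitted
/// as a generic (non-SPMD) kernel.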
515 static bool hasNestedSPMDDirective(ASTContext &Ctx,
516 const OMPExecutableDirective &D) {
517 const auto *CS = D.getInnermostCapturedStmt();
518 const auto *Body =
519 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
520 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521
522 if (const auto *NestedDir =
523 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
524 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
525 switch (D.getDirectiveKind()) {
526 case OMPD_target:
527 if (isOpenMPParallelDirective(DKind))
528 return true;
529 if (DKind == OMPD_teams) {
530 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
531 /*IgnoreCaptured=*/true);
532 if (!Body)
533 return false;
534 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
535 if (const auto *NND =
536 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
537 DKind = NND->getDirectiveKind();
538 if (isOpenMPParallelDirective(DKind))
539 return true;
540 }
541 }
542 return false;
543 case OMPD_target_teams:
544 return isOpenMPParallelDirective(DKind);
545 case OMPD_target_simd:
546 case OMPD_target_parallel:
547 case OMPD_target_parallel_for:
548 case OMPD_target_parallel_for_simd:
549 case OMPD_target_teams_distribute:
550 case OMPD_target_teams_distribute_simd:
551 case OMPD_target_teams_distribute_parallel_for:
552 case OMPD_target_teams_distribute_parallel_for_simd:
553 case OMPD_parallel:
554 case OMPD_for:
555 case OMPD_parallel_for:
556 case OMPD_parallel_master:
557 case OMPD_parallel_sections:
558 case OMPD_for_simd:
559 case OMPD_parallel_for_simd:
560 case OMPD_cancel:
561 case OMPD_cancellation_point:
562 case OMPD_ordered:
563 case OMPD_threadprivate:
564 case OMPD_allocate:
565 case OMPD_task:
566 case OMPD_simd:
567 case OMPD_sections:
568 case OMPD_section:
569 case OMPD_single:
570 case OMPD_master:
571 case OMPD_critical:
572 case OMPD_taskyield:
573 case OMPD_barrier:
574 case OMPD_taskwait:
575 case OMPD_taskgroup:
576 case OMPD_atomic:
577 case OMPD_flush:
578 case OMPD_depobj:
579 case OMPD_scan:
580 case OMPD_teams:
581 case OMPD_target_data:
582 case OMPD_target_exit_data:
583 case OMPD_target_enter_data:
584 case OMPD_distribute:
585 case OMPD_distribute_simd:
586 case OMPD_distribute_parallel_for:
587 case OMPD_distribute_parallel_for_simd:
588 case OMPD_teams_distribute:
589 case OMPD_teams_distribute_simd:
590 case OMPD_teams_distribute_parallel_for:
591 case OMPD_teams_distribute_parallel_for_simd:
592 case OMPD_target_update:
593 case OMPD_declare_simd:
594 case OMPD_declare_variant:
595 case OMPD_begin_declare_variant:
596 case OMPD_end_declare_variant:
597 case OMPD_declare_target:
598 case OMPD_end_declare_target:
599 case OMPD_declare_reduction:
600 case OMPD_declare_mapper:
601 case OMPD_taskloop:
602 case OMPD_taskloop_simd:
603 case OMPD_master_taskloop:
604 case OMPD_master_taskloop_simd:
605 case OMPD_parallel_master_taskloop:
606 case OMPD_parallel_master_taskloop_simd:
607 case OMPD_requires:
608 case OMPD_unknown:
609 default:
610 llvm_unreachable("Unexpected directive.");
611 }
612 }
613
614 return false;
615 }
616
617 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
618 const OMPExecutableDirective &D) {
619 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
620 switch (DirectiveKind) {
621 case OMPD_target:
622 case OMPD_target_teams:
623 return hasNestedSPMDDirective(Ctx, D);
624 case OMPD_target_parallel_loop:
625 case OMPD_target_parallel:
626 case OMPD_target_parallel_for:
627 case OMPD_target_parallel_for_simd:
628 case OMPD_target_teams_distribute_parallel_for:
629 case OMPD_target_teams_distribute_parallel_for_simd:
630 case OMPD_target_simd:
631 case OMPD_target_teams_distribute_simd:
632 return true;
633 case OMPD_target_teams_distribute:
634 return false;
635 case OMPD_target_teams_loop:
636 // Whether this is true or not depends on how the directive will
637 // eventually be emitted.
638 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
639 return TTLD->canBeParallelFor();
640 return false;
641 case OMPD_parallel:
642 case OMPD_for:
643 case OMPD_parallel_for:
644 case OMPD_parallel_master:
645 case OMPD_parallel_sections:
646 case OMPD_for_simd:
647 case OMPD_parallel_for_simd:
648 case OMPD_cancel:
649 case OMPD_cancellation_point:
650 case OMPD_ordered:
651 case OMPD_threadprivate:
652 case OMPD_allocate:
653 case OMPD_task:
654 case OMPD_simd:
655 case OMPD_sections:
656 case OMPD_section:
657 case OMPD_single:
658 case OMPD_master:
659 case OMPD_critical:
660 case OMPD_taskyield:
661 case OMPD_barrier:
662 case OMPD_taskwait:
663 case OMPD_taskgroup:
664 case OMPD_atomic:
665 case OMPD_flush:
666 case OMPD_depobj:
667 case OMPD_scan:
668 case OMPD_teams:
669 case OMPD_target_data:
670 case OMPD_target_exit_data:
671 case OMPD_target_enter_data:
672 case OMPD_distribute:
673 case OMPD_distribute_simd:
674 case OMPD_distribute_parallel_for:
675 case OMPD_distribute_parallel_for_simd:
676 case OMPD_teams_distribute:
677 case OMPD_teams_distribute_simd:
678 case OMPD_teams_distribute_parallel_for:
679 case OMPD_teams_distribute_parallel_for_simd:
680 case OMPD_target_update:
681 case OMPD_declare_simd:
682 case OMPD_declare_variant:
683 case OMPD_begin_declare_variant:
684 case OMPD_end_declare_variant:
685 case OMPD_declare_target:
686 case OMPD_end_declare_target:
687 case OMPD_declare_reduction:
688 case OMPD_declare_mapper:
689 case OMPD_taskloop:
690 case OMPD_taskloop_simd:
691 case OMPD_master_taskloop:
692 case OMPD_master_taskloop_simd:
693 case OMPD_parallel_master_taskloop:
694 case OMPD_parallel_master_taskloop_simd:
695 case OMPD_requires:
696 case OMPD_unknown:
697 default:
698 break;
699 }
700 llvm_unreachable(
701 "Unknown programming model for OpenMP directive on NVPTX target.");
702 }
703
704 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
705 StringRef ParentName,
706 llvm::Function *&OutlinedFn,
707 llvm::Constant *&OutlinedFnID,
708 bool IsOffloadEntry,
709 const RegionCodeGenTy &CodeGen) {
710 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
711 EntryFunctionState EST;
712 WrapperFunctionsMap.clear();
713
714 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
715 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716
717 // Emit target region as a standalone region.
718 class NVPTXPrePostActionTy : public PrePostActionTy {
719 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720 const OMPExecutableDirective &D;
721
722 public:
723 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 const OMPExecutableDirective &D)
725 : EST(EST), D(D) {}
726 void Enter(CodeGenFunction &CGF) override {
727 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
728 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
729 // Skip target region initialization.
730 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 }
732 void Exit(CodeGenFunction &CGF) override {
733 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.clearLocThreadIdInsertPt(CGF);
735 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 }
737 } Action(EST, D);
738 CodeGen.setAction(Action);
739 IsInTTDRegion = true;
740 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
741 IsOffloadEntry, CodeGen);
742 IsInTTDRegion = false;
743 }
744
745 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
746 CodeGenFunction &CGF,
747 EntryFunctionState &EST, bool IsSPMD) {
748 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
749 MaxTeamsVal = -1;
750 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
751 MinTeamsVal, MaxTeamsVal);
752
753 CGBuilderTy &Bld = CGF.Builder;
754 Bld.restoreIP(OMPBuilder.createTargetInit(
755 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
756 if (!IsSPMD)
757 emitGenericVarsProlog(CGF, EST.Loc);
758 }
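// Note (a sketch under assumptions; exact signatures depend on the
// OpenMPIRBuilder and device runtime versions): for a generic-mode kernel the
// createTargetInit() call above lowers to the device runtime's initialization
// entry point, roughly
//
//   %exec_tid = call i32 @__kmpc_target_init(...)
//
// where worker threads enter the runtime's state machine and only the main
// thread falls through to the user code, after which emitGenericVarsProlog()
// sets up the globalized locals.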
759
760 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
761 EntryFunctionState &EST,
762 bool IsSPMD) {
763 if (!IsSPMD)
764 emitGenericVarsEpilog(CGF);
765
766   // This is temporary until we remove the fixed-size buffer.
767 ASTContext &C = CGM.getContext();
768 RecordDecl *StaticRD = C.buildImplicitRecord(
769 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
770 StaticRD->startDefinition();
771 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
772 QualType RecTy = C.getRecordType(TeamReductionRec);
773 auto *Field = FieldDecl::Create(
774 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
775 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
776 /*BW=*/nullptr, /*Mutable=*/false,
777 /*InitStyle=*/ICIS_NoInit);
778 Field->setAccess(AS_public);
779 StaticRD->addDecl(Field);
780 }
781 StaticRD->completeDefinition();
782 QualType StaticTy = C.getRecordType(StaticRD);
783 llvm::Type *LLVMReductionsBufferTy =
784 CGM.getTypes().ConvertTypeForMem(StaticTy);
785 const auto &DL = CGM.getModule().getDataLayout();
786 uint64_t ReductionDataSize =
787 TeamsReductions.empty()
788 ? 0
789 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
790 CGBuilderTy &Bld = CGF.Builder;
791 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
792 C.getLangOpts().OpenMPCUDAReductionBufNum);
793 TeamsReductions.clear();
794 }
795
796 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
797 StringRef ParentName,
798 llvm::Function *&OutlinedFn,
799 llvm::Constant *&OutlinedFnID,
800 bool IsOffloadEntry,
801 const RegionCodeGenTy &CodeGen) {
802 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
803 EntryFunctionState EST;
804
805 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806
807 // Emit target region as a standalone region.
808 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU &RT;
810 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
811 bool IsBareKernel;
812 DataSharingMode Mode;
813 const OMPExecutableDirective &D;
814
815 public:
816 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
817 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
818 bool IsBareKernel, const OMPExecutableDirective &D)
819 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
820 Mode(RT.CurrentDataSharingMode), D(D) {}
821 void Enter(CodeGenFunction &CGF) override {
822 if (IsBareKernel) {
823 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
824 return;
825 }
826 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
827 // Skip target region initialization.
828 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 }
830 void Exit(CodeGenFunction &CGF) override {
831 if (IsBareKernel) {
832 RT.CurrentDataSharingMode = Mode;
833 return;
834 }
835 RT.clearLocThreadIdInsertPt(CGF);
836 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 }
838 } Action(*this, EST, IsBareKernel, D);
839 CodeGen.setAction(Action);
840 IsInTTDRegion = true;
841 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
842 IsOffloadEntry, CodeGen);
843 IsInTTDRegion = false;
844 }
845
846 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
847 const OMPExecutableDirective &D, StringRef ParentName,
848 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
849 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
850 if (!IsOffloadEntry) // Nothing to do.
851 return;
852
853 assert(!ParentName.empty() && "Invalid target region parent name!");
854
855 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
856 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
857 if (Mode || IsBareKernel)
858 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
859 CodeGen);
860 else
861 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
862 CodeGen);
863 }
864
865 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
866 : CGOpenMPRuntime(CGM) {
867 llvm::OpenMPIRBuilderConfig Config(
868 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
869 CGM.getLangOpts().OpenMPOffloadMandatory,
870 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
871 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
872 OMPBuilder.setConfig(Config);
873
874 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
875 llvm_unreachable("OpenMP can only handle device code.");
876
877 if (CGM.getLangOpts().OpenMPCUDAMode)
878 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879
880 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
881 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
882 return;
883
884 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
885 "__omp_rtl_debug_kind");
886 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
887 "__omp_rtl_assume_teams_oversubscription");
888 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
889 "__omp_rtl_assume_threads_oversubscription");
890 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
891 "__omp_rtl_assume_no_thread_state");
892 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
893 "__omp_rtl_assume_no_nested_parallelism");
894 }
895
896 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
897 ProcBindKind ProcBind,
898 SourceLocation Loc) {
899 // Nothing to do.
900 }
901
902 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
903 llvm::Value *NumThreads,
904 SourceLocation Loc) {
905 // Nothing to do.
906 }
907
908 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
909 const Expr *NumTeams,
910 const Expr *ThreadLimit,
911 SourceLocation Loc) {}
912
913 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
914 CodeGenFunction &CGF, const OMPExecutableDirective &D,
915 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
916 const RegionCodeGenTy &CodeGen) {
917 // Emit target region as a standalone region.
918 bool PrevIsInTTDRegion = IsInTTDRegion;
919 IsInTTDRegion = false;
920 auto *OutlinedFun =
921 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
922 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
923 IsInTTDRegion = PrevIsInTTDRegion;
924 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
925 llvm::Function *WrapperFun =
926 createParallelDataSharingWrapper(OutlinedFun, D);
927 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
928 }
929
930 return OutlinedFun;
931 }
932
933 /// Get list of lastprivate variables from the teams distribute ... or
934 /// teams {distribute ...} directives.
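/// For illustration, given
///
///   #pragma omp target teams distribute parallel for lastprivate(a)
///
/// (or an equivalent 'teams' directive with a nested 'distribute ...'), 'a' is
/// collected here so that its team-level copy can be globalized when the
/// region is emitted in SPMD mode.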
935 static void
936 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
937 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
938 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
939 "expected teams directive.");
940 const OMPExecutableDirective *Dir = &D;
941 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
942 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
943 Ctx,
944 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
945 /*IgnoreCaptured=*/true))) {
946 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
947 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
948 Dir = nullptr;
949 }
950 }
951 if (!Dir)
952 return;
953 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
954 for (const Expr *E : C->getVarRefs())
955 Vars.push_back(getPrivateItem(E));
956 }
957 }
958
959 /// Get list of reduction variables from the teams ... directives.
960 static void
961 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
962 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
963 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
964 "expected teams directive.");
965 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
966 for (const Expr *E : C->privates())
967 Vars.push_back(getPrivateItem(E));
968 }
969 }
970
971 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
972 CodeGenFunction &CGF, const OMPExecutableDirective &D,
973 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
974 const RegionCodeGenTy &CodeGen) {
975 SourceLocation Loc = D.getBeginLoc();
976
977 const RecordDecl *GlobalizedRD = nullptr;
978 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
979 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
980 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
981   // Globalize teams reduction variables unconditionally in all modes.
982 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
983 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
984 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
985 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
986 if (!LastPrivatesReductions.empty()) {
987 GlobalizedRD = ::buildRecordForGlobalizedVars(
988 CGM.getContext(), std::nullopt, LastPrivatesReductions,
989 MappedDeclsFields, WarpSize);
990 }
991 } else if (!LastPrivatesReductions.empty()) {
992 assert(!TeamAndReductions.first &&
993 "Previous team declaration is not expected.");
994 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
995 std::swap(TeamAndReductions.second, LastPrivatesReductions);
996 }
997
998 // Emit target region as a standalone region.
999 class NVPTXPrePostActionTy : public PrePostActionTy {
1000 SourceLocation &Loc;
1001 const RecordDecl *GlobalizedRD;
1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003 &MappedDeclsFields;
1004
1005 public:
1006 NVPTXPrePostActionTy(
1007 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1008 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1009 &MappedDeclsFields)
1010 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1011 MappedDeclsFields(MappedDeclsFields) {}
1012 void Enter(CodeGenFunction &CGF) override {
1013 auto &Rt =
1014 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1015 if (GlobalizedRD) {
1016 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1017 I->getSecond().MappedParams =
1018 std::make_unique<CodeGenFunction::OMPMapVars>();
1019 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1020 for (const auto &Pair : MappedDeclsFields) {
1021 assert(Pair.getFirst()->isCanonicalDecl() &&
1022 "Expected canonical declaration");
1023 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1024 }
1025 }
1026 Rt.emitGenericVarsProlog(CGF, Loc);
1027 }
1028 void Exit(CodeGenFunction &CGF) override {
1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1030 .emitGenericVarsEpilog(CGF);
1031 }
1032 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1033 CodeGen.setAction(Action);
1034 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1035 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1036
1037 return OutlinedFun;
1038 }
1039
1040 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1041 SourceLocation Loc) {
1042 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1043 return;
1044
1045 CGBuilderTy &Bld = CGF.Builder;
1046
1047 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1048 if (I == FunctionGlobalizedDecls.end())
1049 return;
1050
1051 for (auto &Rec : I->getSecond().LocalVarData) {
1052 const auto *VD = cast<VarDecl>(Rec.first);
1053 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1054 QualType VarTy = VD->getType();
1055
1056 // Get the local allocation of a firstprivate variable before sharing
1057 llvm::Value *ParValue;
1058 if (EscapedParam) {
1059 LValue ParLVal =
1060 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1061 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1062 }
1063
1064 // Allocate space for the variable to be globalized
1065 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1066 llvm::CallBase *VoidPtr =
1067 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1068 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1069 AllocArgs, VD->getName());
1070     // FIXME: We should use the variable's actual alignment as an argument.
1071 VoidPtr->addRetAttr(llvm::Attribute::get(
1072 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1073 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1074
1075 // Cast the void pointer and get the address of the globalized variable.
1076 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1077 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1078 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1079 LValue VarAddr =
1080 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1081 Rec.second.PrivateAddr = VarAddr.getAddress();
1082 Rec.second.GlobalizedVal = VoidPtr;
1083
1084 // Assign the local allocation to the newly globalized location.
1085 if (EscapedParam) {
1086 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1087 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1088 }
1089 if (auto *DI = CGF.getDebugInfo())
1090 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1091 }
1092
1093 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1094 const auto *VD = cast<VarDecl>(ValueD);
1095 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1096 getKmpcAllocShared(CGF, VD);
1097 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1098 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1099 CGM.getContext().getDeclAlign(VD),
1100 AlignmentSource::Decl);
1101 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1102 }
1103 I->getSecond().MappedParams->apply(CGF);
1104 }
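// For illustration, globalizing an escaped 'int x' in generic data-sharing
// mode yields roughly the following IR (names are placeholders; the alignment
// attribute depends on the target's preferred alignment):
//
//   %x = call align 8 ptr @__kmpc_alloc_shared(i64 4)
//   ...                                ; all uses of 'x' go through %x
//   call void @__kmpc_free_shared(ptr %x, i64 4)   ; emitted in the epilog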
1105
1106 bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1107 const VarDecl *VD) const {
1108 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1109 if (I == FunctionGlobalizedDecls.end())
1110 return false;
1111
1112   // Check whether the variable declaration is delayed:
1113 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1114 }
1115
1116 std::pair<llvm::Value *, llvm::Value *>
1117 CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1118 const VarDecl *VD) {
1119 CGBuilderTy &Bld = CGF.Builder;
1120
1121 // Compute size and alignment.
1122 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1123 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1124 Size = Bld.CreateNUWAdd(
1125 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1126 llvm::Value *AlignVal =
1127 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1128 Size = Bld.CreateUDiv(Size, AlignVal);
1129 Size = Bld.CreateNUWMul(Size, AlignVal);
1130
1131 // Allocate space for this VLA object to be globalized.
1132 llvm::Value *AllocArgs[] = {Size};
1133 llvm::CallBase *VoidPtr =
1134 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1135 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1136 AllocArgs, VD->getName());
1137 VoidPtr->addRetAttr(llvm::Attribute::get(
1138 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1139
1140 return std::make_pair(VoidPtr, Size);
1141 }
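// Worked example: for a VLA whose dynamic size is 10 bytes with a declared
// alignment of 8, the computation above yields (10 + 7) / 8 * 8 == 16, i.e.
// the size passed to __kmpc_alloc_shared is rounded up to a multiple of the
// alignment.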
1142
1143 void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1144 CodeGenFunction &CGF,
1145 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1146 // Deallocate the memory for each globalized VLA object
1147 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1148 CGM.getModule(), OMPRTL___kmpc_free_shared),
1149 {AddrSizePair.first, AddrSizePair.second});
1150 }
1151
1152 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1153 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1154 return;
1155
1156 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1157 if (I != FunctionGlobalizedDecls.end()) {
1158 // Deallocate the memory for each globalized VLA object that was
1159 // globalized in the prolog (i.e. emitGenericVarsProlog).
1160 for (const auto &AddrSizePair :
1161 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1162 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1163 CGM.getModule(), OMPRTL___kmpc_free_shared),
1164 {AddrSizePair.first, AddrSizePair.second});
1165 }
1166 // Deallocate the memory for each globalized value
1167 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1168 const auto *VD = cast<VarDecl>(Rec.first);
1169 I->getSecond().MappedParams->restore(CGF);
1170
1171 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1172 CGF.getTypeSize(VD->getType())};
1173 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1174 CGM.getModule(), OMPRTL___kmpc_free_shared),
1175 FreeArgs);
1176 }
1177 }
1178 }
1179
1180 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1181 const OMPExecutableDirective &D,
1182 SourceLocation Loc,
1183 llvm::Function *OutlinedFn,
1184 ArrayRef<llvm::Value *> CapturedVars) {
1185 if (!CGF.HaveInsertPoint())
1186 return;
1187
1188 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1189
1190 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1191 /*Name=*/".zero.addr");
1192 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1193 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1194   // We don't emit any thread id function call in a bare kernel, but because
1195   // the outlined function has a pointer argument, we emit a nullptr here.
1196 if (IsBareKernel)
1197 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1198 else
1199 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1200 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1201 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1202 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1203 }
1204
1205 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1206 SourceLocation Loc,
1207 llvm::Function *OutlinedFn,
1208 ArrayRef<llvm::Value *> CapturedVars,
1209 const Expr *IfCond,
1210 llvm::Value *NumThreads) {
1211 if (!CGF.HaveInsertPoint())
1212 return;
1213
1214 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1215 NumThreads](CodeGenFunction &CGF,
1216 PrePostActionTy &Action) {
1217 CGBuilderTy &Bld = CGF.Builder;
1218 llvm::Value *NumThreadsVal = NumThreads;
1219 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1220 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1221 if (WFn)
1222 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1223 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1224
1225 // Create a private scope that will globalize the arguments
1226 // passed from the outside of the target region.
1227 // TODO: Is that needed?
1228 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1229
1230 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1231 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1232 "captured_vars_addrs");
1233 // There's something to share.
1234 if (!CapturedVars.empty()) {
1235 // Prepare for parallel region. Indicate the outlined function.
1236 ASTContext &Ctx = CGF.getContext();
1237 unsigned Idx = 0;
1238 for (llvm::Value *V : CapturedVars) {
1239 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1240 llvm::Value *PtrV;
1241 if (V->getType()->isIntegerTy())
1242 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1243 else
1244 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1245 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1246 Ctx.getPointerType(Ctx.VoidPtrTy));
1247 ++Idx;
1248 }
1249 }
1250
1251 llvm::Value *IfCondVal = nullptr;
1252 if (IfCond)
1253 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1254 /* isSigned */ false);
1255 else
1256 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1257
1258 if (!NumThreadsVal)
1259 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1260 else
1261       NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1262
1263 assert(IfCondVal && "Expected a value");
1264 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1265 llvm::Value *Args[] = {
1266 RTLoc,
1267 getThreadID(CGF, Loc),
1268 IfCondVal,
1269 NumThreadsVal,
1270 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1271 FnPtr,
1272 ID,
1273 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1274 CGF.VoidPtrPtrTy),
1275 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1276 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1277 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1278 Args);
1279 };
1280
1281 RegionCodeGenTy RCG(ParallelGen);
1282 RCG(CGF);
1283 }
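// For illustration, the argument list assembled above corresponds to a call of
// the form (names are placeholders):
//
//   call void @__kmpc_parallel_51(ptr %loc, i32 %gtid, i32 %if_cond,
//                                 i32 %num_threads, i32 -1 /*proc_bind*/,
//                                 ptr %outlined_fn, ptr %wrapper_id,
//                                 ptr %captured_vars_addrs, i64 %num_vars)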
1284
1285 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1286 // Always emit simple barriers!
1287 if (!CGF.HaveInsertPoint())
1288 return;
1289 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1290 // This function does not use parameters, so we can emit just default values.
1291 llvm::Value *Args[] = {
1292 llvm::ConstantPointerNull::get(
1293 cast<llvm::PointerType>(getIdentTyPointerTy())),
1294 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1295 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1296 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1297 Args);
1298 }
1299
1300 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1301 SourceLocation Loc,
1302 OpenMPDirectiveKind Kind, bool,
1303 bool) {
1304 // Always emit simple barriers!
1305 if (!CGF.HaveInsertPoint())
1306 return;
1307   // Build call __kmpc_barrier(loc, thread_id);
1308 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1309 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1310 getThreadID(CGF, Loc)};
1311
1312 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1313 CGM.getModule(), OMPRTL___kmpc_barrier),
1314 Args);
1315 }
1316
1317 void CGOpenMPRuntimeGPU::emitCriticalRegion(
1318 CodeGenFunction &CGF, StringRef CriticalName,
1319 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1320 const Expr *Hint) {
1321 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1322 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1323 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1324 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1325 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1326
1327 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1328
1329 // Get the mask of active threads in the warp.
1330 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1331 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1332 // Fetch team-local id of the thread.
1333 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1334
1335 // Get the width of the team.
1336 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1337
1338 // Initialize the counter variable for the loop.
1339 QualType Int32Ty =
1340 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1341 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1342 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1343 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1344 /*isInit=*/true);
1345
1346 // Block checks if loop counter exceeds upper bound.
1347 CGF.EmitBlock(LoopBB);
1348 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1349 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1350 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1351
1352   // Block tests which single thread should execute the region, and which
1353   // threads should go straight to the synchronisation point.
1354 CGF.EmitBlock(TestBB);
1355 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1356 llvm::Value *CmpThreadToCounter =
1357 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1358 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1359
1360 // Block emits the body of the critical region.
1361 CGF.EmitBlock(BodyBB);
1362
1363 // Output the critical statement.
1364 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1365 Hint);
1366
1367 // After the body surrounded by the critical region, the single executing
1368 // thread will jump to the synchronisation point.
1369 // Block waits for all threads in current team to finish then increments the
1370 // counter variable and returns to the loop.
1371 CGF.EmitBlock(SyncBB);
1372 // Reconverge active threads in the warp.
1373 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1374 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1375 Mask);
1376
1377 llvm::Value *IncCounterVal =
1378 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1379 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1380 CGF.EmitBranch(LoopBB);
1381
1382 // Block that is reached when all threads in the team complete the region.
1383 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1384 }
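// In pseudocode, the control flow emitted above is roughly:
//
//   mask = __kmpc_warp_active_thread_mask();
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical region body>;      // one thread of the team at a time
//     __kmpc_syncwarp(mask);         // reconverge before the next iteration
//   }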
1385
1386 /// Cast value to the specified type.
1387 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1388 QualType ValTy, QualType CastTy,
1389 SourceLocation Loc) {
1390   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1391          "Cast type must be sized.");
1392   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1393          "Val type must be sized.");
1394 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1395 if (ValTy == CastTy)
1396 return Val;
1397 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1398 CGF.getContext().getTypeSizeInChars(CastTy))
1399 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1400 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1401 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1402 CastTy->hasSignedIntegerRepresentation());
1403 Address CastItem = CGF.CreateMemTemp(CastTy);
1404 Address ValCastItem = CastItem.withElementType(Val->getType());
1405 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1406 LValueBaseInfo(AlignmentSource::Type),
1407 TBAAAccessInfo());
1408 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1409 LValueBaseInfo(AlignmentSource::Type),
1410 TBAAAccessInfo());
1411 }
1412
1413 ///
1414 /// Design of OpenMP reductions on the GPU
1415 ///
1416 /// Consider a typical OpenMP program with one or more reduction
1417 /// clauses:
1418 ///
1419 /// float foo;
1420 /// double bar;
1421 /// #pragma omp target teams distribute parallel for \
1422 /// reduction(+:foo) reduction(*:bar)
1423 /// for (int i = 0; i < N; i++) {
1424 /// foo += A[i]; bar *= B[i];
1425 /// }
1426 ///
1427 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1428 /// all teams. In our OpenMP implementation on the NVPTX device an
1429 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1430 /// within a team are mapped to CUDA threads within a threadblock.
1431 /// Our goal is to efficiently aggregate values across all OpenMP
1432 /// threads such that:
1433 ///
1434 /// - the compiler and runtime are logically concise, and
1435 /// - the reduction is performed efficiently in a hierarchical
1436 /// manner as follows: within OpenMP threads in the same warp,
1437 /// across warps in a threadblock, and finally across teams on
1438 /// the NVPTX device.
1439 ///
1440 /// Introduction to Decoupling
1441 ///
1442 /// We would like to decouple the compiler and the runtime so that the
1443 /// latter is ignorant of the reduction variables (number, data types)
1444 /// and the reduction operators. This allows a simpler interface
1445 /// and implementation while still attaining good performance.
1446 ///
1447 /// Pseudocode for the aforementioned OpenMP program generated by the
1448 /// compiler is as follows:
1449 ///
1450 /// 1. Create private copies of reduction variables on each OpenMP
1451 /// thread: 'foo_private', 'bar_private'
1452 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1453 /// to it and writes the result in 'foo_private' and 'bar_private'
1454 /// respectively.
1455 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1456 /// and store the result on the team master:
1457 ///
1458 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1459 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1460 ///
1461 /// where:
1462 /// struct ReduceData {
1463 /// double *foo;
1464 /// double *bar;
1465 /// } reduceData
1466 /// reduceData.foo = &foo_private
1467 /// reduceData.bar = &bar_private
1468 ///
1469 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1470 /// auxiliary functions generated by the compiler that operate on
1471 /// variables of type 'ReduceData'. They help the runtime perform the
1472 /// algorithmic steps in a data-agnostic manner.
1473 ///
1474 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1475 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1476 /// same warp. It takes the following arguments as input:
1477 ///
1478 /// a. variable of type 'ReduceData' on the calling lane,
1479 /// b. its lane_id,
1480 /// c. an offset relative to the current lane_id to generate a
1481 /// remote_lane_id. The remote lane contains the second
1482 /// variable of type 'ReduceData' that is to be reduced.
1483 /// d. an algorithm version parameter determining which reduction
1484 /// algorithm to use.
1485 ///
1486 /// 'shuffleReduceFn' retrieves data from the remote lane using
1487 /// efficient GPU shuffle intrinsics and reduces, using the
1488 /// algorithm specified by the 4th parameter, the two operands
1489 /// element-wise. The result is written to the first operand.
1490 ///
1491 /// Different reduction algorithms are implemented in different
1492 /// runtime functions, all calling 'shuffleReduceFn' to perform
1493 /// the essential reduction step. Therefore, based on the 4th
1494 /// parameter, this function behaves slightly differently to
1495 /// cooperate with the runtime to ensure correctness under
1496 /// different circumstances.
1497 ///
1498 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1499 /// reduced variables across warps. It tunnels, through CUDA
1500 /// shared memory, the thread-private data of type 'ReduceData'
1501 /// from lane 0 of each warp to a lane in the first warp.
1502 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1503 /// The last team writes the global reduced value to memory.
1504 ///
1505 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1506 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1507 /// scratchpadCopyFn, loadAndReduceFn)
1508 ///
1509 /// 'scratchpadCopyFn' is a helper that stores reduced
1510 /// data from the team master to a scratchpad array in
1511 /// global memory.
1512 ///
1513 /// 'loadAndReduceFn' is a helper that loads data from
1514 /// the scratchpad array and reduces it with the input
1515 /// operand.
1516 ///
1517 /// These compiler generated functions hide address
1518 /// calculation and alignment information from the runtime.
1519 /// 5. if ret == 1:
1520 /// The team master of the last team stores the reduced
1521 /// result to the globals in memory.
1522 /// foo += reduceData.foo; bar *= reduceData.bar
1523 ///
1524 ///
1525 /// Warp Reduction Algorithms
1526 ///
1527 /// On the warp level, we have three algorithms implemented in the
1528 /// OpenMP runtime depending on the number of active lanes:
1529 ///
1530 /// Full Warp Reduction
1531 ///
1532 /// The reduce algorithm within a warp where all lanes are active
1533 /// is implemented in the runtime as follows:
1534 ///
1535 /// full_warp_reduce(void *reduce_data,
1536 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1539 /// }
1540 ///
1541 /// The algorithm completes in log2(WARPSIZE) steps.
1542 ///
1543 /// 'ShuffleReduceFn' is called here with lane_id set to 0 because the
1544 /// lane_id is not used by this algorithm; we therefore save instructions by
1545 /// not retrieving it from the corresponding special registers. The 4th
1546 /// parameter, which represents the version of the algorithm being used, is
1547 /// set to 0 to signify full warp reduction.
1548 ///
1549 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1550 ///
1551 /// #reduce_elem refers to an element in the local lane's data structure
1552 /// #remote_elem is retrieved from a remote lane
1553 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
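///
/// As a small worked example (illustrative only), take WARPSIZE == 8, values
/// v0..v7 in lanes 0..7, and REDUCE_OP == '+':
///
///   offset 4: lane i (i < 4) holds v_i + v_{i+4}
///   offset 2: lane i (i < 2) holds the sum of a 4-element group
///   offset 1: lane 0 holds v0 + v1 + ... + v7
///
/// Lanes at or beyond the offset also execute the update, but their results
/// are never read, so the reduction finishes in log2(8) == 3 steps with the
/// final value in lane 0.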
1555 ///
1556 /// Contiguous Partial Warp Reduction
1557 ///
1558 /// This reduce algorithm is used within a warp where only the first
1559 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1560 /// number of OpenMP threads in a parallel region is not a multiple of
1561 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1562 ///
1563 /// void
1564 /// contiguous_partial_reduce(void *reduce_data,
1565 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566 /// int size, int lane_id) {
1567 /// int curr_size;
1568 /// int offset;
1569 /// curr_size = size;
1570 /// offset = curr_size/2;
1571 /// while (offset > 0) {
1572 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573 /// curr_size = (curr_size+1)/2;
1574 /// offset = curr_size/2;
1575 /// }
1576 /// }
1577 ///
1578 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1579 ///
1580 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581 /// if (lane_id < offset)
1582 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1583 /// else
1584 /// reduce_elem = remote_elem
1585 ///
1586 /// This algorithm assumes that the data to be reduced are located in a
1587 /// contiguous subset of lanes starting from the first. When there is
1588 /// an odd number of active lanes, the data in the last lane is not
1589 /// aggregated with any other lane's data but is instead copied over.
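///
/// For example (illustrative only), with 5 active lanes holding v0..v4 and
/// REDUCE_OP == '+', the loop above performs:
///
///   round 1 (offset 2): lane0 = v0+v2, lane1 = v1+v3, lane2 = v4
///   round 2 (offset 1): lane0 = (v0+v2)+(v1+v3), lane1 = v4
///   round 3 (offset 1): lane0 = v0+v1+v2+v3+v4
///
/// after which curr_size reaches 1, offset reaches 0, and the loop exits with
/// the result in lane 0.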
1590 ///
1591 /// Dispersed Partial Warp Reduction
1592 ///
1593 /// This algorithm is used within a warp when any discontiguous subset of
1594 /// lanes is active. It is used to implement the reduction operation
1595 /// across lanes in an OpenMP simd region or in a nested parallel region.
1596 ///
1597 /// void
1598 /// dispersed_partial_reduce(void *reduce_data,
1599 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600 /// int size, remote_id;
1601 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1602 /// do {
1603 /// remote_id = next_active_lane_id_right_after_me();
1604 /// # the above function returns 0 if no active lane
1605 /// # is present right after the current lane.
1606 /// size = number_of_active_lanes_in_this_warp();
1607 /// logical_lane_id /= 2;
1608 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1609 /// remote_id-1-threadIdx.x, 2);
1610 /// } while (logical_lane_id % 2 == 0 && size > 1);
1611 /// }
1612 ///
1613 /// There is no assumption made about the initial state of the reduction.
1614 /// Any number of lanes (>=1) could be active at any position. The reduction
1615 /// result is returned in the first active lane.
1616 ///
1617 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1618 ///
1619 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620 /// if (lane_id % 2 == 0 && offset > 0)
1621 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1622 /// else
1623 /// reduce_elem = remote_elem
1624 ///
1625 ///
1626 /// Intra-Team Reduction
1627 ///
1628 /// This function, as implemented in the runtime call
1629 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630 /// threads in a team. It first reduces within a warp using the
1631 /// aforementioned algorithms. We then proceed to gather all such
1632 /// reduced values at the first warp.
1633 ///
1634 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635 /// data from each of the "warp masters" (the zeroth lane of each warp, where
1636 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1637 /// a mathematical sense) the problem of reduction across warp masters in
1638 /// a block to the problem of warp reduction.
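///
/// A minimal per-element sketch of what 'InterWarpCpyFn' amounts to, assuming
/// a shared-memory transfer buffer with one slot per warp (the generated code
/// additionally handles arbitrary element sizes and warp counts):
///
///   if (lane_id == 0)                        // each warp master
///     transfer_medium[warp_id] = my_element;
///   barrier();
///   if (warp_id == 0)                        // first warp picks the values up
///     my_element = transfer_medium[lane_id];
///
/// after which the first warp applies one of the warp-reduction algorithms
/// described earlier to the gathered values.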
1639 ///
1640 ///
1641 /// Inter-Team Reduction
1642 ///
1643 /// Once a team has reduced its data to a single value, it is stored in
1644 /// a global scratchpad array. Since each team has a distinct slot, this
1645 /// can be done without locking.
1646 ///
1647 /// The last team to write to the scratchpad array proceeds to reduce the
1648 /// scratchpad array. One or more workers in the last team use the helper
1649 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
1650 /// the k'th worker reduces every k'th element.
1651 ///
1652 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1653 /// reduce across workers and compute a globally reduced value.
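///
/// Schematically, the inter-team step is roughly (illustrative only; the
/// precise handshaking, e.g. how the last team is detected, lives in the
/// runtime):
///
///   scratchpad[team_id] = team_master_value;       // via scratchpadCopyFn
///   if (this team is the last one to arrive) {
///     // worker k folds elements k, k+num_workers, ... of the scratchpad
///     partial_k = load_and_reduce(scratchpad, k);  // via loadAndReduceFn
///     <reduce the partials across workers as in the intra-team case>
///     ret = 1;                                     // only for this team
///   }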
1654 ///
1655 void CGOpenMPRuntimeGPU::emitReduction(
1656 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1657 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1658 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1659 if (!CGF.HaveInsertPoint())
1660 return;
1661
1662 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1663 bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
1664 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
1665
1666 ASTContext &C = CGM.getContext();
1667
1668 if (Options.SimpleReduction) {
1669 assert(!TeamsReduction && !ParallelReduction &&
1670 "Invalid reduction selection in emitReduction.");
1671 (void)ParallelReduction;
1672 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1673 ReductionOps, Options);
1674 return;
1675 }
1676
1677 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1678 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1679 int Cnt = 0;
1680 for (const Expr *DRE : Privates) {
1681 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
1682 ++Cnt;
1683 }
1684 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1685 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
1686
1687 if (TeamsReduction)
1688 TeamsReductions.push_back(ReductionRec);
1689
1690 // Source location for the ident struct
1691 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1692
1693 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1694 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1695 CGF.AllocaInsertPt->getIterator());
1696 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1697 CGF.Builder.GetInsertPoint());
1698 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1699 CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
1700 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1701
1702 CodeGenFunction::OMPPrivateScope Scope(CGF);
1703 unsigned Idx = 0;
1704 for (const Expr *Private : Privates) {
1705 llvm::Type *ElementType;
1706 llvm::Value *Variable;
1707 llvm::Value *PrivateVariable;
1708 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1709 ElementType = CGF.ConvertTypeForMem(Private->getType());
1710 const auto *RHSVar =
1711 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
1712 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
1713 const auto *LHSVar =
1714 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
1715 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
1716 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1717 switch (CGF.getEvaluationKind(Private->getType())) {
1718 case TEK_Scalar:
1719 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1720 break;
1721 case TEK_Complex:
1722 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1723 break;
1724 case TEK_Aggregate:
1725 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1726 break;
1727 }
1728 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1729 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1730 llvm::Function *NewFunc) {
1731 CGF.Builder.restoreIP(CodeGenIP);
1732 auto *CurFn = CGF.CurFn;
1733 CGF.CurFn = NewFunc;
1734
1735 *LHSPtr = CGF.GetAddrOfLocalVar(
1736 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
1737 .emitRawPointer(CGF);
1738 *RHSPtr = CGF.GetAddrOfLocalVar(
1739 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
1740 .emitRawPointer(CGF);
1741
1742 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
1743 cast<DeclRefExpr>(LHSExprs[I]),
1744 cast<DeclRefExpr>(RHSExprs[I]));
1745
1746 CGF.CurFn = CurFn;
1747
1748 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1749 CGF.Builder.GetInsertPoint());
1750 };
1751 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
1752 ElementType, Variable, PrivateVariable, EvalKind,
1753 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1754 Idx++;
1755 }
1756
1757 CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
1758 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
1759 DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1760 CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
1761 RTLoc));
1762 return;
1763 }
1764
1765 const VarDecl *
1766 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1767 const VarDecl *NativeParam) const {
1768 if (!NativeParam->getType()->isReferenceType())
1769 return NativeParam;
1770 QualType ArgType = NativeParam->getType();
1771 QualifierCollector QC;
1772 const Type *NonQualTy = QC.strip(ArgType);
1773 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1774 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1775 if (Attr->getCaptureKind() == OMPC_map) {
1776 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
1777 LangAS::opencl_global);
1778 }
1779 }
1780 ArgType = CGM.getContext().getPointerType(PointeeTy);
1781 QC.addRestrict();
1782 enum { NVPTX_local_addr = 5 };
1783 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
1784 ArgType = QC.apply(CGM.getContext(), ArgType);
1785 if (isa<ImplicitParamDecl>(NativeParam))
1786 return ImplicitParamDecl::Create(
1787 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
1788 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
1789 return ParmVarDecl::Create(
1790 CGM.getContext(),
1791 const_cast<DeclContext *>(NativeParam->getDeclContext()),
1792 NativeParam->getBeginLoc(), NativeParam->getLocation(),
1793 NativeParam->getIdentifier(), ArgType,
1794 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
1795 }
1796
1797 Address
1798 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1799 const VarDecl *NativeParam,
1800 const VarDecl *TargetParam) const {
1801 assert(NativeParam != TargetParam &&
1802 NativeParam->getType()->isReferenceType() &&
1803 "Native arg must not be the same as target arg.");
1804 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
1805 QualType NativeParamType = NativeParam->getType();
1806 QualifierCollector QC;
1807 const Type *NonQualTy = QC.strip(NativeParamType);
1808 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1809 unsigned NativePointeeAddrSpace =
1810 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
1811 QualType TargetTy = TargetParam->getType();
1812 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
1813 TargetTy, SourceLocation());
1814 // Cast to native address space.
1815 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1816 TargetAddr,
1817 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
1818 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1819 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1820 NativeParamType);
1821 return NativeParamAddr;
1822 }
1823
1824 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1825 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1826 ArrayRef<llvm::Value *> Args) const {
1827 SmallVector<llvm::Value *, 4> TargetArgs;
1828 TargetArgs.reserve(Args.size());
1829 auto *FnType = OutlinedFn.getFunctionType();
1830 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1831 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1832 TargetArgs.append(std::next(Args.begin(), I), Args.end());
1833 break;
1834 }
1835 llvm::Type *TargetType = FnType->getParamType(I);
1836 llvm::Value *NativeArg = Args[I];
1837 if (!TargetType->isPointerTy()) {
1838 TargetArgs.emplace_back(NativeArg);
1839 continue;
1840 }
1841 TargetArgs.emplace_back(
1842 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1843 }
1844 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1845 }
1846
1847 /// Emit function which wraps the outline parallel region
1848 /// and controls the arguments which are passed to this function.
1849 /// The wrapper ensures that the outlined function is called
1850 /// with the correct arguments when data is shared.
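///
/// Roughly, the generated wrapper has the following shape (illustrative; the
/// exact trailing argument list depends on the shared loop bounds and the
/// captured variables):
///
///   void <outlined>_wrapper(uint16_t parallel_level, uint32_t tid) {
///     uint32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     <outlined>(&tid, &zero, /* values loaded from shared_args */ ...);
///   }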
1851 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1852 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1853 ASTContext &Ctx = CGM.getContext();
1854 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1855
1856 // Create a function that takes as argument the source thread.
1857 FunctionArgList WrapperArgs;
1858 QualType Int16QTy =
1859 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1860 QualType Int32QTy =
1861 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1862 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1863 /*Id=*/nullptr, Int16QTy,
1864 ImplicitParamKind::Other);
1865 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1866 /*Id=*/nullptr, Int32QTy,
1867 ImplicitParamKind::Other);
1868 WrapperArgs.emplace_back(&ParallelLevelArg);
1869 WrapperArgs.emplace_back(&WrapperArg);
1870
1871 const CGFunctionInfo &CGFI =
1872 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
1873
1874 auto *Fn = llvm::Function::Create(
1875 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1876 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1877
1878 // Ensure we do not inline the function. This is trivially true for the ones
1879 // passed to __kmpc_fork_call but the ones called in serialized regions
1880 // could be inlined. This is not perfect, but it is closer to the invariant
1881 // we want, namely, every data environment starts with a new function.
1882 // TODO: We should pass the if condition to the runtime function and do the
1883 // handling there. Much cleaner code.
1884 Fn->addFnAttr(llvm::Attribute::NoInline);
1885
1886 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1887 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1888 Fn->setDoesNotRecurse();
1889
1890 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1891 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
1892 D.getBeginLoc(), D.getBeginLoc());
1893
1894 const auto *RD = CS.getCapturedRecordDecl();
1895 auto CurField = RD->field_begin();
1896
1897 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1898 /*Name=*/".zero.addr");
1899 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1900 // Get the array of arguments.
1901 SmallVector<llvm::Value *, 8> Args;
1902
1903 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
1904 Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
1905
1906 CGBuilderTy &Bld = CGF.Builder;
1907 auto CI = CS.capture_begin();
1908
1909 // Use global memory for data sharing.
1910 // Handle passing of global args to workers.
1911 RawAddress GlobalArgs =
1912 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
1913 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1914 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1915 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1916 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
1917 DataSharingArgs);
1918
1919 // Retrieve the shared variables from the list of references returned
1920 // by the runtime. Pass the variables to the outlined function.
1921 Address SharedArgListAddress = Address::invalid();
1922 if (CS.capture_size() > 0 ||
1923 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1924 SharedArgListAddress = CGF.EmitLoadOfPointer(
1925 GlobalArgs, CGF.getContext()
1926 .getPointerType(CGF.getContext().VoidPtrTy)
1927 .castAs<PointerType>());
1928 }
1929 unsigned Idx = 0;
1930 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1931 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1932 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1933 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
1934 llvm::Value *LB = CGF.EmitLoadOfScalar(
1935 TypedAddress,
1936 /*Volatile=*/false,
1937 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
1938 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
1939 Args.emplace_back(LB);
1940 ++Idx;
1941 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1942 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1943 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
1944 llvm::Value *UB = CGF.EmitLoadOfScalar(
1945 TypedAddress,
1946 /*Volatile=*/false,
1947 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
1948 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
1949 Args.emplace_back(UB);
1950 ++Idx;
1951 }
1952 if (CS.capture_size() > 0) {
1953 ASTContext &CGFContext = CGF.getContext();
1954 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1955 QualType ElemTy = CurField->getType();
1956 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
1957 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1958 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
1959 CGF.ConvertTypeForMem(ElemTy));
1960 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
1961 /*Volatile=*/false,
1962 CGFContext.getPointerType(ElemTy),
1963 CI->getLocation());
1964 if (CI->capturesVariableByCopy() &&
1965 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
1966 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
1967 CI->getLocation());
1968 }
1969 Args.emplace_back(Arg);
1970 }
1971 }
1972
1973 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
1974 CGF.FinishFunction();
1975 return Fn;
1976 }
1977
1978 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
1979 const Decl *D) {
1980 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1981 return;
1982
1983 assert(D && "Expected function or captured|block decl.");
1984 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
1985 "Function is registered already.");
1986 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
1987 "Team is set but not processed.");
1988 const Stmt *Body = nullptr;
1989 bool NeedToDelayGlobalization = false;
1990 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
1991 Body = FD->getBody();
1992 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
1993 Body = BD->getBody();
1994 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
1995 Body = CD->getBody();
1996 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1997 if (NeedToDelayGlobalization &&
1998 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1999 return;
2000 }
2001 if (!Body)
2002 return;
2003 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2004 VarChecker.Visit(Body);
2005 const RecordDecl *GlobalizedVarsRecord =
2006 VarChecker.getGlobalizedRecord(IsInTTDRegion);
2007 TeamAndReductions.first = nullptr;
2008 TeamAndReductions.second.clear();
2009 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2010 VarChecker.getEscapedVariableLengthDecls();
2011 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2012 VarChecker.getDelayedVariableLengthDecls();
2013 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2014 DelayedVariableLengthDecls.empty())
2015 return;
2016 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2017 I->getSecond().MappedParams =
2018 std::make_unique<CodeGenFunction::OMPMapVars>();
2019 I->getSecond().EscapedParameters.insert(
2020 VarChecker.getEscapedParameters().begin(),
2021 VarChecker.getEscapedParameters().end());
2022 I->getSecond().EscapedVariableLengthDecls.append(
2023 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
2024 I->getSecond().DelayedVariableLengthDecls.append(
2025 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
2026 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2027 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2028 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2029 Data.insert(std::make_pair(VD, MappedVarData()));
2030 }
2031 if (!NeedToDelayGlobalization) {
2032 emitGenericVarsProlog(CGF, D->getBeginLoc());
2033 struct GlobalizationScope final : EHScopeStack::Cleanup {
2034 GlobalizationScope() = default;
2035
2036 void Emit(CodeGenFunction &CGF, Flags flags) override {
2037 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2038 .emitGenericVarsEpilog(CGF);
2039 }
2040 };
2041 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
2042 }
2043 }
2044
2045 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2046 const VarDecl *VD) {
2047 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2048 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2049 auto AS = LangAS::Default;
2050 switch (A->getAllocatorType()) {
2051 // Use the default allocator here as by default local vars are
2052 // threadlocal.
2053 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2054 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2055 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2056 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2057 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2058 // Follow the user decision - use default allocation.
2059 return Address::invalid();
2060 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2061 // TODO: implement support for user-defined allocators.
2062 return Address::invalid();
2063 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2064 AS = LangAS::cuda_constant;
2065 break;
2066 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2067 AS = LangAS::cuda_shared;
2068 break;
2069 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2070 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2071 break;
2072 }
2073 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2074 auto *GV = new llvm::GlobalVariable(
2075 CGM.getModule(), VarTy, /*isConstant=*/false,
2076 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2077 VD->getName(),
2078 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2079 CGM.getContext().getTargetAddressSpace(AS));
2080 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2081 GV->setAlignment(Align.getAsAlign());
2082 return Address(
2083 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2084 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
2085 VD->getType().getAddressSpace()))),
2086 VarTy, Align);
2087 }
2088
2089 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2090 return Address::invalid();
2091
2092 VD = VD->getCanonicalDecl();
2093 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2094 if (I == FunctionGlobalizedDecls.end())
2095 return Address::invalid();
2096 auto VDI = I->getSecond().LocalVarData.find(VD);
2097 if (VDI != I->getSecond().LocalVarData.end())
2098 return VDI->second.PrivateAddr;
2099 if (VD->hasAttrs()) {
2100 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2101 E(VD->attr_end());
2102 IT != E; ++IT) {
2103 auto VDI = I->getSecond().LocalVarData.find(
2104 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2105 ->getCanonicalDecl());
2106 if (VDI != I->getSecond().LocalVarData.end())
2107 return VDI->second.PrivateAddr;
2108 }
2109 }
2110
2111 return Address::invalid();
2112 }
2113
2114 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2115 FunctionGlobalizedDecls.erase(CGF.CurFn);
2116 CGOpenMPRuntime::functionFinished(CGF);
2117 }
2118
2119 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2120 CodeGenFunction &CGF, const OMPLoopDirective &S,
2121 OpenMPDistScheduleClauseKind &ScheduleKind,
2122 llvm::Value *&Chunk) const {
2123 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2124 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2125 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2126 Chunk = CGF.EmitScalarConversion(
2127 RT.getGPUNumThreads(CGF),
2128 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2129 S.getIterationVariable()->getType(), S.getBeginLoc());
2130 return;
2131 }
2132 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2133 CGF, S, ScheduleKind, Chunk);
2134 }
2135
2136 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2137 CodeGenFunction &CGF, const OMPLoopDirective &S,
2138 OpenMPScheduleClauseKind &ScheduleKind,
2139 const Expr *&ChunkExpr) const {
2140 ScheduleKind = OMPC_SCHEDULE_static;
2141 // Chunk size is 1 in this case.
2142 llvm::APInt ChunkSize(32, 1);
2143 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
2144 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2145 SourceLocation());
2146 }
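// For example (illustrative), a plain '#pragma omp parallel for' with no
// schedule clause is treated by this default as if 'schedule(static, 1)' had
// been written, so consecutive threads execute consecutive iterations.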
2147
2148 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2149 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2150 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2151 " Expected target-based directive.");
2152 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
2153 for (const CapturedStmt::Capture &C : CS->captures()) {
2154 // Capture variables captured by reference in lambdas for target-based
2155 // directives.
2156 if (!C.capturesVariable())
2157 continue;
2158 const VarDecl *VD = C.getCapturedVar();
2159 const auto *RD = VD->getType()
2160 .getCanonicalType()
2161 .getNonReferenceType()
2162 ->getAsCXXRecordDecl();
2163 if (!RD || !RD->isLambda())
2164 continue;
2165 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2166 LValue VDLVal;
2167 if (VD->getType().getCanonicalType()->isReferenceType())
2168 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
2169 else
2170 VDLVal = CGF.MakeAddrLValue(
2171 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
2172 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2173 FieldDecl *ThisCapture = nullptr;
2174 RD->getCaptureFields(Captures, ThisCapture);
2175 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2176 LValue ThisLVal =
2177 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
2178 llvm::Value *CXXThis = CGF.LoadCXXThis();
2179 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
2180 }
2181 for (const LambdaCapture &LC : RD->captures()) {
2182 if (LC.getCaptureKind() != LCK_ByRef)
2183 continue;
2184 const ValueDecl *VD = LC.getCapturedVar();
2185 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2186 // capturing structured bindings in lambdas yet.
2187 if (!CS->capturesVariable(cast<VarDecl>(VD)))
2188 continue;
2189 auto It = Captures.find(VD);
2190 assert(It != Captures.end() && "Found lambda capture without field.");
2191 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
2192 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
2193 if (VD->getType().getCanonicalType()->isReferenceType())
2194 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
2195 VD->getType().getCanonicalType())
2196 .getAddress();
2197 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
2198 }
2199 }
2200 }
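// Example of the situation handled above (illustrative):
//
//   int x = 0;
//   auto l = [&x]() { x += 1; };
//   #pragma omp target      // 'l' (and, through it, 'x') reach the device
//   l();
//
// The by-reference field for 'x' inside the device copy of 'l' must be
// re-pointed at the device address of 'x'; the stores emitted above do that.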
2201
2202 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2203 LangAS &AS) {
2204 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2205 return false;
2206 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2207 switch(A->getAllocatorType()) {
2208 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2209 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2210 // Not supported, fall back to the default mem space.
2211 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2212 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2213 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2214 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2215 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2216 AS = LangAS::Default;
2217 return true;
2218 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2219 AS = LangAS::cuda_constant;
2220 return true;
2221 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2222 AS = LangAS::cuda_shared;
2223 return true;
2224 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2225 llvm_unreachable("Expected predefined allocator for the variables with the "
2226 "static storage.");
2227 }
2228 return false;
2229 }
2230
2231 // Get the current OffloadArch, ignoring any unknown values.
2232 static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2233 if (!CGM.getTarget().hasFeature("ptx"))
2234 return OffloadArch::UNKNOWN;
2235 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2236 if (Feature.getValue()) {
2237 OffloadArch Arch = StringToOffloadArch(Feature.getKey());
2238 if (Arch != OffloadArch::UNKNOWN)
2239 return Arch;
2240 }
2241 }
2242 return OffloadArch::UNKNOWN;
2243 }
2244
2245 /// Check to see if the target architecture supports unified addressing, which
2246 /// is a prerequisite for the OpenMP requires clause "unified_shared_memory".
2247 void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2248 for (const OMPClause *Clause : D->clauselists()) {
2249 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2250 OffloadArch Arch = getOffloadArch(CGM);
2251 switch (Arch) {
2252 case OffloadArch::SM_20:
2253 case OffloadArch::SM_21:
2254 case OffloadArch::SM_30:
2255 case OffloadArch::SM_32_:
2256 case OffloadArch::SM_35:
2257 case OffloadArch::SM_37:
2258 case OffloadArch::SM_50:
2259 case OffloadArch::SM_52:
2260 case OffloadArch::SM_53: {
2261 SmallString<256> Buffer;
2262 llvm::raw_svector_ostream Out(Buffer);
2263 Out << "Target architecture " << OffloadArchToString(Arch)
2264 << " does not support unified addressing";
2265 CGM.Error(Clause->getBeginLoc(), Out.str());
2266 return;
2267 }
2268 case OffloadArch::SM_60:
2269 case OffloadArch::SM_61:
2270 case OffloadArch::SM_62:
2271 case OffloadArch::SM_70:
2272 case OffloadArch::SM_72:
2273 case OffloadArch::SM_75:
2274 case OffloadArch::SM_80:
2275 case OffloadArch::SM_86:
2276 case OffloadArch::SM_87:
2277 case OffloadArch::SM_89:
2278 case OffloadArch::SM_90:
2279 case OffloadArch::SM_90a:
2280 case OffloadArch::GFX600:
2281 case OffloadArch::GFX601:
2282 case OffloadArch::GFX602:
2283 case OffloadArch::GFX700:
2284 case OffloadArch::GFX701:
2285 case OffloadArch::GFX702:
2286 case OffloadArch::GFX703:
2287 case OffloadArch::GFX704:
2288 case OffloadArch::GFX705:
2289 case OffloadArch::GFX801:
2290 case OffloadArch::GFX802:
2291 case OffloadArch::GFX803:
2292 case OffloadArch::GFX805:
2293 case OffloadArch::GFX810:
2294 case OffloadArch::GFX9_GENERIC:
2295 case OffloadArch::GFX900:
2296 case OffloadArch::GFX902:
2297 case OffloadArch::GFX904:
2298 case OffloadArch::GFX906:
2299 case OffloadArch::GFX908:
2300 case OffloadArch::GFX909:
2301 case OffloadArch::GFX90a:
2302 case OffloadArch::GFX90c:
2303 case OffloadArch::GFX940:
2304 case OffloadArch::GFX941:
2305 case OffloadArch::GFX942:
2306 case OffloadArch::GFX10_1_GENERIC:
2307 case OffloadArch::GFX1010:
2308 case OffloadArch::GFX1011:
2309 case OffloadArch::GFX1012:
2310 case OffloadArch::GFX1013:
2311 case OffloadArch::GFX10_3_GENERIC:
2312 case OffloadArch::GFX1030:
2313 case OffloadArch::GFX1031:
2314 case OffloadArch::GFX1032:
2315 case OffloadArch::GFX1033:
2316 case OffloadArch::GFX1034:
2317 case OffloadArch::GFX1035:
2318 case OffloadArch::GFX1036:
2319 case OffloadArch::GFX11_GENERIC:
2320 case OffloadArch::GFX1100:
2321 case OffloadArch::GFX1101:
2322 case OffloadArch::GFX1102:
2323 case OffloadArch::GFX1103:
2324 case OffloadArch::GFX1150:
2325 case OffloadArch::GFX1151:
2326 case OffloadArch::GFX1152:
2327 case OffloadArch::GFX12_GENERIC:
2328 case OffloadArch::GFX1200:
2329 case OffloadArch::GFX1201:
2330 case OffloadArch::AMDGCNSPIRV:
2331 case OffloadArch::Generic:
2332 case OffloadArch::UNUSED:
2333 case OffloadArch::UNKNOWN:
2334 break;
2335 case OffloadArch::LAST:
2336 llvm_unreachable("Unexpected GPU arch.");
2337 }
2338 }
2339 }
2340 CGOpenMPRuntime::processRequiresDirective(D);
2341 }
2342
2343 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2344 CGBuilderTy &Bld = CGF.Builder;
2345 llvm::Module *M = &CGF.CGM.getModule();
2346 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2347 llvm::Function *F = M->getFunction(LocSize);
2348 if (!F) {
2349 F = llvm::Function::Create(
2350 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
2351 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
2352 }
2353 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
2354 }
2355
2356 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2357 ArrayRef<llvm::Value *> Args{};
2358 return CGF.EmitRuntimeCall(
2359 OMPBuilder.getOrCreateRuntimeFunction(
2360 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
2361 Args);
2362 }
2363