1 //===- AMDGPU.cpp ---------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "clang/Basic/TargetOptions.h"
12
13 using namespace clang;
14 using namespace clang::CodeGen;
15
16 //===----------------------------------------------------------------------===//
17 // AMDGPU ABI Implementation
18 //===----------------------------------------------------------------------===//
19
20 namespace {
21
22 class AMDGPUABIInfo final : public DefaultABIInfo {
23 private:
24 static const unsigned MaxNumRegsForArgsRet = 16;
25
26 unsigned numRegsForType(QualType Ty) const;
27
28 bool isHomogeneousAggregateBaseType(QualType Ty) const override;
29 bool isHomogeneousAggregateSmallEnough(const Type *Base,
30 uint64_t Members) const override;
31
32 // Coerce HIP scalar pointer arguments from generic pointers to global ones.
coerceKernelArgumentType(llvm::Type * Ty,unsigned FromAS,unsigned ToAS) const33 llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
34 unsigned ToAS) const {
35 // Single value types.
36 auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37 if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38 return llvm::PointerType::get(Ty->getContext(), ToAS);
39 return Ty;
40 }
41
42 public:
AMDGPUABIInfo(CodeGen::CodeGenTypes & CGT)43 explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
44 DefaultABIInfo(CGT) {}
45
46 ABIArgInfo classifyReturnType(QualType RetTy) const;
47 ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
48 ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
49 unsigned &NumRegsLeft) const;
50
51 void computeInfo(CGFunctionInfo &FI) const override;
52 RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
53 AggValueSlot Slot) const override;
54 };
55
isHomogeneousAggregateBaseType(QualType Ty) const56 bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
57 return true;
58 }
59
isHomogeneousAggregateSmallEnough(const Type * Base,uint64_t Members) const60 bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
61 const Type *Base, uint64_t Members) const {
62 uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
63
64 // Homogeneous Aggregates may occupy at most 16 registers.
65 return Members * NumRegs <= MaxNumRegsForArgsRet;
66 }
67
68 /// Estimate number of registers the type will use when passed in registers.
numRegsForType(QualType Ty) const69 unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
70 unsigned NumRegs = 0;
71
72 if (const VectorType *VT = Ty->getAs<VectorType>()) {
73 // Compute from the number of elements. The reported size is based on the
74 // in-memory size, which includes the padding 4th element for 3-vectors.
75 QualType EltTy = VT->getElementType();
76 unsigned EltSize = getContext().getTypeSize(EltTy);
77
78 // 16-bit element vectors should be passed as packed.
79 if (EltSize == 16)
80 return (VT->getNumElements() + 1) / 2;
81
82 unsigned EltNumRegs = (EltSize + 31) / 32;
83 return EltNumRegs * VT->getNumElements();
84 }
85
86 if (const RecordType *RT = Ty->getAs<RecordType>()) {
87 const RecordDecl *RD = RT->getDecl();
88 assert(!RD->hasFlexibleArrayMember());
89
90 for (const FieldDecl *Field : RD->fields()) {
91 QualType FieldTy = Field->getType();
92 NumRegs += numRegsForType(FieldTy);
93 }
94
95 return NumRegs;
96 }
97
98 return (getContext().getTypeSize(Ty) + 31) / 32;
99 }
100
computeInfo(CGFunctionInfo & FI) const101 void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
102 llvm::CallingConv::ID CC = FI.getCallingConvention();
103
104 if (!getCXXABI().classifyReturnType(FI))
105 FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
106
107 unsigned ArgumentIndex = 0;
108 const unsigned numFixedArguments = FI.getNumRequiredArgs();
109
110 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
111 for (auto &Arg : FI.arguments()) {
112 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
113 Arg.info = classifyKernelArgumentType(Arg.type);
114 } else {
115 bool FixedArgument = ArgumentIndex++ < numFixedArguments;
116 Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
117 }
118 }
119 }
120
EmitVAArg(CodeGenFunction & CGF,Address VAListAddr,QualType Ty,AggValueSlot Slot) const121 RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
122 QualType Ty, AggValueSlot Slot) const {
123 const bool IsIndirect = false;
124 const bool AllowHigherAlign = false;
125 return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
126 getContext().getTypeInfoInChars(Ty),
127 CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
128 }
129
classifyReturnType(QualType RetTy) const130 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
131 if (isAggregateTypeForABI(RetTy)) {
132 // Records with non-trivial destructors/copy-constructors should not be
133 // returned by value.
134 if (!getRecordArgABI(RetTy, getCXXABI())) {
135 // Ignore empty structs/unions.
136 if (isEmptyRecord(getContext(), RetTy, true))
137 return ABIArgInfo::getIgnore();
138
139 // Lower single-element structs to just return a regular value.
140 if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
141 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
142
143 if (const RecordType *RT = RetTy->getAs<RecordType>()) {
144 const RecordDecl *RD = RT->getDecl();
145 if (RD->hasFlexibleArrayMember())
146 return DefaultABIInfo::classifyReturnType(RetTy);
147 }
148
149 // Pack aggregates <= 4 bytes into single VGPR or pair.
150 uint64_t Size = getContext().getTypeSize(RetTy);
151 if (Size <= 16)
152 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
153
154 if (Size <= 32)
155 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
156
157 if (Size <= 64) {
158 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
159 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
160 }
161
162 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
163 return ABIArgInfo::getDirect();
164 }
165 }
166
167 // Otherwise just do the default thing.
168 return DefaultABIInfo::classifyReturnType(RetTy);
169 }
170
171 /// For kernels all parameters are really passed in a special buffer. It doesn't
172 /// make sense to pass anything byval, so everything must be direct.
classifyKernelArgumentType(QualType Ty) const173 ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
174 Ty = useFirstFieldIfTransparentUnion(Ty);
175
176 // TODO: Can we omit empty structs?
177
178 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
179 Ty = QualType(SeltTy, 0);
180
181 llvm::Type *OrigLTy = CGT.ConvertType(Ty);
182 llvm::Type *LTy = OrigLTy;
183 if (getContext().getLangOpts().HIP) {
184 LTy = coerceKernelArgumentType(
185 OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
186 /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
187 }
188
189 // FIXME: Should also use this for OpenCL, but it requires addressing the
190 // problem of kernels being called.
191 //
192 // FIXME: This doesn't apply the optimization of coercing pointers in structs
193 // to global address space when using byref. This would require implementing a
194 // new kind of coercion of the in-memory type when for indirect arguments.
195 if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
196 isAggregateTypeForABI(Ty)) {
197 return ABIArgInfo::getIndirectAliased(
198 getContext().getTypeAlignInChars(Ty),
199 getContext().getTargetAddressSpace(LangAS::opencl_constant),
200 false /*Realign*/, nullptr /*Padding*/);
201 }
202
203 // If we set CanBeFlattened to true, CodeGen will expand the struct to its
204 // individual elements, which confuses the Clover OpenCL backend; therefore we
205 // have to set it to false here. Other args of getDirect() are just defaults.
206 return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
207 }
208
classifyArgumentType(QualType Ty,bool Variadic,unsigned & NumRegsLeft) const209 ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
210 unsigned &NumRegsLeft) const {
211 assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
212
213 Ty = useFirstFieldIfTransparentUnion(Ty);
214
215 if (Variadic) {
216 return ABIArgInfo::getDirect(/*T=*/nullptr,
217 /*Offset=*/0,
218 /*Padding=*/nullptr,
219 /*CanBeFlattened=*/false,
220 /*Align=*/0);
221 }
222
223 if (isAggregateTypeForABI(Ty)) {
224 // Records with non-trivial destructors/copy-constructors should not be
225 // passed by value.
226 if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
227 return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
228
229 // Ignore empty structs/unions.
230 if (isEmptyRecord(getContext(), Ty, true))
231 return ABIArgInfo::getIgnore();
232
233 // Lower single-element structs to just pass a regular value. TODO: We
234 // could do reasonable-size multiple-element structs too, using getExpand(),
235 // though watch out for things like bitfields.
236 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
237 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
238
239 if (const RecordType *RT = Ty->getAs<RecordType>()) {
240 const RecordDecl *RD = RT->getDecl();
241 if (RD->hasFlexibleArrayMember())
242 return DefaultABIInfo::classifyArgumentType(Ty);
243 }
244
245 // Pack aggregates <= 8 bytes into single VGPR or pair.
246 uint64_t Size = getContext().getTypeSize(Ty);
247 if (Size <= 64) {
248 unsigned NumRegs = (Size + 31) / 32;
249 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
250
251 if (Size <= 16)
252 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
253
254 if (Size <= 32)
255 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
256
257 // XXX: Should this be i64 instead, and should the limit increase?
258 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
259 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
260 }
261
262 if (NumRegsLeft > 0) {
263 unsigned NumRegs = numRegsForType(Ty);
264 if (NumRegsLeft >= NumRegs) {
265 NumRegsLeft -= NumRegs;
266 return ABIArgInfo::getDirect();
267 }
268 }
269
270 // Use pass-by-reference in stead of pass-by-value for struct arguments in
271 // function ABI.
272 return ABIArgInfo::getIndirectAliased(
273 getContext().getTypeAlignInChars(Ty),
274 getContext().getTargetAddressSpace(LangAS::opencl_private));
275 }
276
277 // Otherwise just do the default thing.
278 ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
279 if (!ArgInfo.isIndirect()) {
280 unsigned NumRegs = numRegsForType(Ty);
281 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
282 }
283
284 return ArgInfo;
285 }
286
287 class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
288 public:
AMDGPUTargetCodeGenInfo(CodeGenTypes & CGT)289 AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
290 : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
291
292 void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
293 CodeGenModule &CGM) const;
294
295 void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
296
297 void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
298 CodeGen::CodeGenModule &M) const override;
299 unsigned getOpenCLKernelCallingConv() const override;
300
301 llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
302 llvm::PointerType *T, QualType QT) const override;
303
getASTAllocaAddressSpace() const304 LangAS getASTAllocaAddressSpace() const override {
305 return getLangASFromTargetAS(
306 getABIInfo().getDataLayout().getAllocaAddrSpace());
307 }
308 LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
309 const VarDecl *D) const override;
310 llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
311 SyncScope Scope,
312 llvm::AtomicOrdering Ordering,
313 llvm::LLVMContext &Ctx) const override;
314 llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
315 llvm::Function *BlockInvokeFunc,
316 llvm::Type *BlockTy) const override;
317 bool shouldEmitStaticExternCAliases() const override;
318 bool shouldEmitDWARFBitFieldSeparators() const override;
319 void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
320 };
321 }
322
requiresAMDGPUProtectedVisibility(const Decl * D,llvm::GlobalValue * GV)323 static bool requiresAMDGPUProtectedVisibility(const Decl *D,
324 llvm::GlobalValue *GV) {
325 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
326 return false;
327
328 return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
329 (D->hasAttr<OpenCLKernelAttr>() ||
330 (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
331 (isa<VarDecl>(D) &&
332 (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
333 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
334 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
335 }
336
setFunctionDeclAttributes(const FunctionDecl * FD,llvm::Function * F,CodeGenModule & M) const337 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
338 const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
339 const auto *ReqdWGS =
340 M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
341 const bool IsOpenCLKernel =
342 M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
343 const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
344
345 const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
346 if (ReqdWGS || FlatWGS) {
347 M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
348 } else if (IsOpenCLKernel || IsHIPKernel) {
349 // By default, restrict the maximum size to a value specified by
350 // --gpu-max-threads-per-block=n or its default value for HIP.
351 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
352 const unsigned DefaultMaxWorkGroupSize =
353 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
354 : M.getLangOpts().GPUMaxThreadsPerBlock;
355 std::string AttrVal =
356 std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
357 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
358 }
359
360 if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
361 M.handleAMDGPUWavesPerEUAttr(F, Attr);
362
363 if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
364 unsigned NumSGPR = Attr->getNumSGPR();
365
366 if (NumSGPR != 0)
367 F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
368 }
369
370 if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
371 uint32_t NumVGPR = Attr->getNumVGPR();
372
373 if (NumVGPR != 0)
374 F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
375 }
376
377 if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
378 uint32_t X = Attr->getMaxNumWorkGroupsX()
379 ->EvaluateKnownConstInt(M.getContext())
380 .getExtValue();
381 // Y and Z dimensions default to 1 if not specified
382 uint32_t Y = Attr->getMaxNumWorkGroupsY()
383 ? Attr->getMaxNumWorkGroupsY()
384 ->EvaluateKnownConstInt(M.getContext())
385 .getExtValue()
386 : 1;
387 uint32_t Z = Attr->getMaxNumWorkGroupsZ()
388 ? Attr->getMaxNumWorkGroupsZ()
389 ->EvaluateKnownConstInt(M.getContext())
390 .getExtValue()
391 : 1;
392
393 llvm::SmallString<32> AttrVal;
394 llvm::raw_svector_ostream OS(AttrVal);
395 OS << X << ',' << Y << ',' << Z;
396
397 F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
398 }
399 }
400
401 /// Emits control constants used to change per-architecture behaviour in the
402 /// AMDGPU ROCm device libraries.
emitTargetGlobals(CodeGen::CodeGenModule & CGM) const403 void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
404 CodeGen::CodeGenModule &CGM) const {
405 StringRef Name = "__oclc_ABI_version";
406 llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
407 if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
408 return;
409
410 if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
411 llvm::CodeObjectVersionKind::COV_None)
412 return;
413
414 auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
415 llvm::Constant *COV = llvm::ConstantInt::get(
416 Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
417
418 // It needs to be constant weak_odr without externally_initialized so that
419 // the load instuction can be eliminated by the IPSCCP.
420 auto *GV = new llvm::GlobalVariable(
421 CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
422 nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
423 CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
424 GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
425 GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
426
427 // Replace any external references to this variable with the new global.
428 if (OriginalGV) {
429 OriginalGV->replaceAllUsesWith(GV);
430 GV->takeName(OriginalGV);
431 OriginalGV->eraseFromParent();
432 }
433 }
434
setTargetAttributes(const Decl * D,llvm::GlobalValue * GV,CodeGen::CodeGenModule & M) const435 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
436 const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
437 if (requiresAMDGPUProtectedVisibility(D, GV)) {
438 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
439 GV->setDSOLocal(true);
440 }
441
442 if (GV->isDeclaration())
443 return;
444
445 llvm::Function *F = dyn_cast<llvm::Function>(GV);
446 if (!F)
447 return;
448
449 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
450 if (FD)
451 setFunctionDeclAttributes(FD, F, M);
452
453 if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
454 F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
455
456 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
457 F->addFnAttr("amdgpu-ieee", "false");
458 }
459
getOpenCLKernelCallingConv() const460 unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
461 return llvm::CallingConv::AMDGPU_KERNEL;
462 }
463
464 // Currently LLVM assumes null pointers always have value 0,
465 // which results in incorrectly transformed IR. Therefore, instead of
466 // emitting null pointers in private and local address spaces, a null
467 // pointer in generic address space is emitted which is casted to a
468 // pointer in local or private address space.
getNullPointer(const CodeGen::CodeGenModule & CGM,llvm::PointerType * PT,QualType QT) const469 llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
470 const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
471 QualType QT) const {
472 if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
473 return llvm::ConstantPointerNull::get(PT);
474
475 auto &Ctx = CGM.getContext();
476 auto NPT = llvm::PointerType::get(
477 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
478 return llvm::ConstantExpr::getAddrSpaceCast(
479 llvm::ConstantPointerNull::get(NPT), PT);
480 }
481
482 LangAS
getGlobalVarAddressSpace(CodeGenModule & CGM,const VarDecl * D) const483 AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
484 const VarDecl *D) const {
485 assert(!CGM.getLangOpts().OpenCL &&
486 !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
487 "Address space agnostic languages only");
488 LangAS DefaultGlobalAS = getLangASFromTargetAS(
489 CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
490 if (!D)
491 return DefaultGlobalAS;
492
493 LangAS AddrSpace = D->getType().getAddressSpace();
494 if (AddrSpace != LangAS::Default)
495 return AddrSpace;
496
497 // Only promote to address space 4 if VarDecl has constant initialization.
498 if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
499 D->hasConstantInitialization()) {
500 if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
501 return *ConstAS;
502 }
503 return DefaultGlobalAS;
504 }
505
506 llvm::SyncScope::ID
getLLVMSyncScopeID(const LangOptions & LangOpts,SyncScope Scope,llvm::AtomicOrdering Ordering,llvm::LLVMContext & Ctx) const507 AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
508 SyncScope Scope,
509 llvm::AtomicOrdering Ordering,
510 llvm::LLVMContext &Ctx) const {
511 std::string Name;
512 switch (Scope) {
513 case SyncScope::HIPSingleThread:
514 case SyncScope::SingleScope:
515 Name = "singlethread";
516 break;
517 case SyncScope::HIPWavefront:
518 case SyncScope::OpenCLSubGroup:
519 case SyncScope::WavefrontScope:
520 Name = "wavefront";
521 break;
522 case SyncScope::HIPWorkgroup:
523 case SyncScope::OpenCLWorkGroup:
524 case SyncScope::WorkgroupScope:
525 Name = "workgroup";
526 break;
527 case SyncScope::HIPAgent:
528 case SyncScope::OpenCLDevice:
529 case SyncScope::DeviceScope:
530 Name = "agent";
531 break;
532 case SyncScope::SystemScope:
533 case SyncScope::HIPSystem:
534 case SyncScope::OpenCLAllSVMDevices:
535 Name = "";
536 break;
537 }
538
539 if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
540 if (!Name.empty())
541 Name = Twine(Twine(Name) + Twine("-")).str();
542
543 Name = Twine(Twine(Name) + Twine("one-as")).str();
544 }
545
546 return Ctx.getOrInsertSyncScopeID(Name);
547 }
548
shouldEmitStaticExternCAliases() const549 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
550 return false;
551 }
552
shouldEmitDWARFBitFieldSeparators() const553 bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
554 return true;
555 }
556
setCUDAKernelCallingConvention(const FunctionType * & FT) const557 void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
558 const FunctionType *&FT) const {
559 FT = getABIInfo().getContext().adjustFunctionType(
560 FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
561 }
562
563 /// Create an OpenCL kernel for an enqueued block.
564 ///
565 /// The type of the first argument (the block literal) is the struct type
566 /// of the block literal instead of a pointer type. The first argument
567 /// (block literal) is passed directly by value to the kernel. The kernel
568 /// allocates the same type of struct on stack and stores the block literal
569 /// to it and passes its pointer to the block invoke function. The kernel
570 /// has "enqueued-block" function attribute and kernel argument metadata.
createEnqueuedBlockKernel(CodeGenFunction & CGF,llvm::Function * Invoke,llvm::Type * BlockTy) const571 llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
572 CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
573 auto &Builder = CGF.Builder;
574 auto &C = CGF.getLLVMContext();
575
576 auto *InvokeFT = Invoke->getFunctionType();
577 llvm::SmallVector<llvm::Type *, 2> ArgTys;
578 llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
579 llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
580 llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
581 llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
582 llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
583 llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
584
585 ArgTys.push_back(BlockTy);
586 ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
587 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
588 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
589 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
590 AccessQuals.push_back(llvm::MDString::get(C, "none"));
591 ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
592 for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
593 ArgTys.push_back(InvokeFT->getParamType(I));
594 ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
595 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
596 AccessQuals.push_back(llvm::MDString::get(C, "none"));
597 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
598 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
599 ArgNames.push_back(
600 llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
601 }
602 std::string Name = Invoke->getName().str() + "_kernel";
603 auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
604 auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
605 &CGF.CGM.getModule());
606 F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
607
608 llvm::AttrBuilder KernelAttrs(C);
609 // FIXME: The invoke isn't applying the right attributes either
610 // FIXME: This is missing setTargetAttributes
611 CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
612 KernelAttrs.addAttribute("enqueued-block");
613 F->addFnAttrs(KernelAttrs);
614
615 auto IP = CGF.Builder.saveIP();
616 auto *BB = llvm::BasicBlock::Create(C, "entry", F);
617 Builder.SetInsertPoint(BB);
618 const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
619 auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
620 BlockPtr->setAlignment(BlockAlign);
621 Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
622 auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
623 llvm::SmallVector<llvm::Value *, 2> Args;
624 Args.push_back(Cast);
625 for (llvm::Argument &A : llvm::drop_begin(F->args()))
626 Args.push_back(&A);
627 llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
628 call->setCallingConv(Invoke->getCallingConv());
629 Builder.CreateRetVoid();
630 Builder.restoreIP(IP);
631
632 F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
633 F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
634 F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
635 F->setMetadata("kernel_arg_base_type",
636 llvm::MDNode::get(C, ArgBaseTypeNames));
637 F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
638 if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
639 F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
640
641 return F;
642 }
643
handleAMDGPUFlatWorkGroupSizeAttr(llvm::Function * F,const AMDGPUFlatWorkGroupSizeAttr * FlatWGS,const ReqdWorkGroupSizeAttr * ReqdWGS,int32_t * MinThreadsVal,int32_t * MaxThreadsVal)644 void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
645 llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
646 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
647 int32_t *MaxThreadsVal) {
648 unsigned Min = 0;
649 unsigned Max = 0;
650 if (FlatWGS) {
651 Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
652 Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
653 }
654 if (ReqdWGS && Min == 0 && Max == 0)
655 Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
656
657 if (Min != 0) {
658 assert(Min <= Max && "Min must be less than or equal Max");
659
660 if (MinThreadsVal)
661 *MinThreadsVal = Min;
662 if (MaxThreadsVal)
663 *MaxThreadsVal = Max;
664 std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
665 if (F)
666 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
667 } else
668 assert(Max == 0 && "Max must be zero");
669 }
670
handleAMDGPUWavesPerEUAttr(llvm::Function * F,const AMDGPUWavesPerEUAttr * Attr)671 void CodeGenModule::handleAMDGPUWavesPerEUAttr(
672 llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
673 unsigned Min =
674 Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
675 unsigned Max =
676 Attr->getMax()
677 ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
678 : 0;
679
680 if (Min != 0) {
681 assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
682
683 std::string AttrVal = llvm::utostr(Min);
684 if (Max != 0)
685 AttrVal = AttrVal + "," + llvm::utostr(Max);
686 F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
687 } else
688 assert(Max == 0 && "Max must be zero");
689 }
690
691 std::unique_ptr<TargetCodeGenInfo>
createAMDGPUTargetCodeGenInfo(CodeGenModule & CGM)692 CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
693 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
694 }
695