1 //===- AMDGPU.cpp ---------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "llvm/ADT/StringExtras.h"
12 #include "llvm/Support/AMDGPUAddrSpace.h"
13
14 using namespace clang;
15 using namespace clang::CodeGen;
16
17 //===----------------------------------------------------------------------===//
18 // AMDGPU ABI Implementation
19 //===----------------------------------------------------------------------===//
20
21 namespace {
22
/// ABI lowering for AMDGPU targets.
///
/// Extends DefaultABIInfo with a register-budget-aware classification of
/// arguments and return values, plus kernel-argument handling (kernel
/// parameters are passed via a special buffer, never byval).
class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  // Budget of 32-bit registers available for passing arguments and for the
  // return value; beyond this, values are passed/returned indirectly.
  static const unsigned MaxNumRegsForArgsRet = 16;

  /// Estimate the number of 32-bit registers \p Ty occupies when passed
  /// directly in registers.
  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};
67
isHomogeneousAggregateBaseType(QualType Ty) const68 bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
69 return true;
70 }
71
isHomogeneousAggregateSmallEnough(const Type * Base,uint64_t Members) const72 bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
73 const Type *Base, uint64_t Members) const {
74 uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
75
76 // Homogeneous Aggregates may occupy at most 16 registers.
77 return Members * NumRegs <= MaxNumRegsForArgsRet;
78 }
79
80 /// Estimate number of registers the type will use when passed in registers.
numRegsForType(QualType Ty) const81 unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
82 unsigned NumRegs = 0;
83
84 if (const VectorType *VT = Ty->getAs<VectorType>()) {
85 // Compute from the number of elements. The reported size is based on the
86 // in-memory size, which includes the padding 4th element for 3-vectors.
87 QualType EltTy = VT->getElementType();
88 unsigned EltSize = getContext().getTypeSize(EltTy);
89
90 // 16-bit element vectors should be passed as packed.
91 if (EltSize == 16)
92 return (VT->getNumElements() + 1) / 2;
93
94 unsigned EltNumRegs = (EltSize + 31) / 32;
95 return EltNumRegs * VT->getNumElements();
96 }
97
98 if (const RecordType *RT = Ty->getAs<RecordType>()) {
99 const RecordDecl *RD = RT->getDecl();
100 assert(!RD->hasFlexibleArrayMember());
101
102 for (const FieldDecl *Field : RD->fields()) {
103 QualType FieldTy = Field->getType();
104 NumRegs += numRegsForType(FieldTy);
105 }
106
107 return NumRegs;
108 }
109
110 return (getContext().getTypeSize(Ty) + 31) / 32;
111 }
112
computeInfo(CGFunctionInfo & FI) const113 void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
114 llvm::CallingConv::ID CC = FI.getCallingConvention();
115
116 if (!getCXXABI().classifyReturnType(FI))
117 FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
118
119 unsigned ArgumentIndex = 0;
120 const unsigned numFixedArguments = FI.getNumRequiredArgs();
121
122 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
123 for (auto &Arg : FI.arguments()) {
124 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
125 Arg.info = classifyKernelArgumentType(Arg.type);
126 } else {
127 bool FixedArgument = ArgumentIndex++ < numFixedArguments;
128 Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
129 }
130 }
131 }
132
EmitVAArg(CodeGenFunction & CGF,Address VAListAddr,QualType Ty,AggValueSlot Slot) const133 RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
134 QualType Ty, AggValueSlot Slot) const {
135 const bool IsIndirect = false;
136 const bool AllowHigherAlign = false;
137 return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
138 getContext().getTypeInfoInChars(Ty),
139 CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
140 }
141
classifyReturnType(QualType RetTy) const142 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
143 if (isAggregateTypeForABI(RetTy)) {
144 // Records with non-trivial destructors/copy-constructors should not be
145 // returned by value.
146 if (!getRecordArgABI(RetTy, getCXXABI())) {
147 // Ignore empty structs/unions.
148 if (isEmptyRecord(getContext(), RetTy, true))
149 return ABIArgInfo::getIgnore();
150
151 // Lower single-element structs to just return a regular value.
152 if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
153 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
154
155 if (const RecordType *RT = RetTy->getAs<RecordType>()) {
156 const RecordDecl *RD = RT->getDecl();
157 if (RD->hasFlexibleArrayMember())
158 return DefaultABIInfo::classifyReturnType(RetTy);
159 }
160
161 // Pack aggregates <= 4 bytes into single VGPR or pair.
162 uint64_t Size = getContext().getTypeSize(RetTy);
163 if (Size <= 16)
164 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
165
166 if (Size <= 32)
167 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
168
169 if (Size <= 64) {
170 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
171 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
172 }
173
174 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
175 return ABIArgInfo::getDirect();
176 }
177 }
178
179 // Otherwise just do the default thing.
180 return DefaultABIInfo::classifyReturnType(RetTy);
181 }
182
/// For kernels all parameters are really passed in a special buffer. It doesn't
/// make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  // Lower single-element structs to their sole element.
  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  // For HIP, rewrite generic-address-space pointer arguments to
  // device-global pointers (see coerceKernelArgumentType).
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // Aggregates whose type was not changed by coercion are passed byref in
  // the constant address space instead of being copied.
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type when for indirect arguments.
  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}
216
classifyArgumentType(QualType Ty,bool Variadic,unsigned & NumRegsLeft) const217 ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
218 unsigned &NumRegsLeft) const {
219 assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
220
221 Ty = useFirstFieldIfTransparentUnion(Ty);
222
223 if (Variadic) {
224 return ABIArgInfo::getDirect(/*T=*/nullptr,
225 /*Offset=*/0,
226 /*Padding=*/nullptr,
227 /*CanBeFlattened=*/false,
228 /*Align=*/0);
229 }
230
231 if (isAggregateTypeForABI(Ty)) {
232 // Records with non-trivial destructors/copy-constructors should not be
233 // passed by value.
234 if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
235 return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
236 RAA == CGCXXABI::RAA_DirectInMemory);
237
238 // Ignore empty structs/unions.
239 if (isEmptyRecord(getContext(), Ty, true))
240 return ABIArgInfo::getIgnore();
241
242 // Lower single-element structs to just pass a regular value. TODO: We
243 // could do reasonable-size multiple-element structs too, using getExpand(),
244 // though watch out for things like bitfields.
245 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
246 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
247
248 if (const RecordType *RT = Ty->getAs<RecordType>()) {
249 const RecordDecl *RD = RT->getDecl();
250 if (RD->hasFlexibleArrayMember())
251 return DefaultABIInfo::classifyArgumentType(Ty);
252 }
253
254 // Pack aggregates <= 8 bytes into single VGPR or pair.
255 uint64_t Size = getContext().getTypeSize(Ty);
256 if (Size <= 64) {
257 unsigned NumRegs = (Size + 31) / 32;
258 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
259
260 if (Size <= 16)
261 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
262
263 if (Size <= 32)
264 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
265
266 // XXX: Should this be i64 instead, and should the limit increase?
267 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
268 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
269 }
270
271 if (NumRegsLeft > 0) {
272 unsigned NumRegs = numRegsForType(Ty);
273 if (NumRegsLeft >= NumRegs) {
274 NumRegsLeft -= NumRegs;
275 return ABIArgInfo::getDirect();
276 }
277 }
278
279 // Use pass-by-reference in stead of pass-by-value for struct arguments in
280 // function ABI.
281 return ABIArgInfo::getIndirectAliased(
282 getContext().getTypeAlignInChars(Ty),
283 getContext().getTargetAddressSpace(LangAS::opencl_private));
284 }
285
286 // Otherwise just do the default thing.
287 ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
288 if (!ArgInfo.isIndirect()) {
289 unsigned NumRegs = numRegsForType(Ty);
290 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
291 }
292
293 return ArgInfo;
294 }
295
/// AMDGPU-specific code generation hooks: function attributes, address
/// spaces, sync scopes, atomic metadata, and OpenCL device-enqueue kernels.
class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  // Library calls are not supported on this target.
  bool supportsLibCall() const override { return false; }
  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getDeviceKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
                                 llvm::PointerType *T, QualType QT) const override;

  // Allocas live in the target's alloca address space, per the data layout.
  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
332 }
333
requiresAMDGPUProtectedVisibility(const Decl * D,llvm::GlobalValue * GV)334 static bool requiresAMDGPUProtectedVisibility(const Decl *D,
335 llvm::GlobalValue *GV) {
336 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
337 return false;
338
339 return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
340 (D->hasAttr<DeviceKernelAttr>() ||
341 (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
342 (isa<VarDecl>(D) &&
343 (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
344 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
345 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
346 }
347
setFunctionDeclAttributes(const FunctionDecl * FD,llvm::Function * F,CodeGenModule & M) const348 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
349 const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
350 const auto *ReqdWGS =
351 M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
352 const bool IsOpenCLKernel =
353 M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
354 const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
355
356 const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
357 if (ReqdWGS || FlatWGS) {
358 M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
359 } else if (IsOpenCLKernel || IsHIPKernel) {
360 // By default, restrict the maximum size to a value specified by
361 // --gpu-max-threads-per-block=n or its default value for HIP.
362 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
363 const unsigned DefaultMaxWorkGroupSize =
364 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
365 : M.getLangOpts().GPUMaxThreadsPerBlock;
366 std::string AttrVal =
367 std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
368 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
369 }
370
371 if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
372 M.handleAMDGPUWavesPerEUAttr(F, Attr);
373
374 if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
375 unsigned NumSGPR = Attr->getNumSGPR();
376
377 if (NumSGPR != 0)
378 F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
379 }
380
381 if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
382 uint32_t NumVGPR = Attr->getNumVGPR();
383
384 if (NumVGPR != 0)
385 F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
386 }
387
388 if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
389 uint32_t X = Attr->getMaxNumWorkGroupsX()
390 ->EvaluateKnownConstInt(M.getContext())
391 .getExtValue();
392 // Y and Z dimensions default to 1 if not specified
393 uint32_t Y = Attr->getMaxNumWorkGroupsY()
394 ? Attr->getMaxNumWorkGroupsY()
395 ->EvaluateKnownConstInt(M.getContext())
396 .getExtValue()
397 : 1;
398 uint32_t Z = Attr->getMaxNumWorkGroupsZ()
399 ? Attr->getMaxNumWorkGroupsZ()
400 ->EvaluateKnownConstInt(M.getContext())
401 .getExtValue()
402 : 1;
403
404 llvm::SmallString<32> AttrVal;
405 llvm::raw_svector_ostream OS(AttrVal);
406 OS << X << ',' << Y << ',' << Z;
407
408 F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
409 }
410 }
411
setTargetAttributes(const Decl * D,llvm::GlobalValue * GV,CodeGen::CodeGenModule & M) const412 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
413 const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
414 if (requiresAMDGPUProtectedVisibility(D, GV)) {
415 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
416 GV->setDSOLocal(true);
417 }
418
419 if (GV->isDeclaration())
420 return;
421
422 llvm::Function *F = dyn_cast<llvm::Function>(GV);
423 if (!F)
424 return;
425
426 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
427 if (FD)
428 setFunctionDeclAttributes(FD, F, M);
429
430 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
431 F->addFnAttr("amdgpu-ieee", "false");
432 }
433
getDeviceKernelCallingConv() const434 unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
435 return llvm::CallingConv::AMDGPU_KERNEL;
436 }
437
438 // Currently LLVM assumes null pointers always have value 0,
439 // which results in incorrectly transformed IR. Therefore, instead of
440 // emitting null pointers in private and local address spaces, a null
441 // pointer in generic address space is emitted which is casted to a
442 // pointer in local or private address space.
getNullPointer(const CodeGen::CodeGenModule & CGM,llvm::PointerType * PT,QualType QT) const443 llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
444 const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
445 QualType QT) const {
446 if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
447 return llvm::ConstantPointerNull::get(PT);
448
449 auto &Ctx = CGM.getContext();
450 auto NPT = llvm::PointerType::get(
451 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
452 return llvm::ConstantExpr::getAddrSpaceCast(
453 llvm::ConstantPointerNull::get(NPT), PT);
454 }
455
456 LangAS
getGlobalVarAddressSpace(CodeGenModule & CGM,const VarDecl * D) const457 AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
458 const VarDecl *D) const {
459 assert(!CGM.getLangOpts().OpenCL &&
460 !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
461 "Address space agnostic languages only");
462 LangAS DefaultGlobalAS = getLangASFromTargetAS(
463 CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
464 if (!D)
465 return DefaultGlobalAS;
466
467 LangAS AddrSpace = D->getType().getAddressSpace();
468 if (AddrSpace != LangAS::Default)
469 return AddrSpace;
470
471 // Only promote to address space 4 if VarDecl has constant initialization.
472 if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
473 D->hasConstantInitialization()) {
474 if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
475 return *ConstAS;
476 }
477 return DefaultGlobalAS;
478 }
479
480 llvm::SyncScope::ID
getLLVMSyncScopeID(const LangOptions & LangOpts,SyncScope Scope,llvm::AtomicOrdering Ordering,llvm::LLVMContext & Ctx) const481 AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
482 SyncScope Scope,
483 llvm::AtomicOrdering Ordering,
484 llvm::LLVMContext &Ctx) const {
485 std::string Name;
486 switch (Scope) {
487 case SyncScope::HIPSingleThread:
488 case SyncScope::SingleScope:
489 Name = "singlethread";
490 break;
491 case SyncScope::HIPWavefront:
492 case SyncScope::OpenCLSubGroup:
493 case SyncScope::WavefrontScope:
494 Name = "wavefront";
495 break;
496 case SyncScope::HIPWorkgroup:
497 case SyncScope::OpenCLWorkGroup:
498 case SyncScope::WorkgroupScope:
499 Name = "workgroup";
500 break;
501 case SyncScope::HIPAgent:
502 case SyncScope::OpenCLDevice:
503 case SyncScope::DeviceScope:
504 Name = "agent";
505 break;
506 case SyncScope::SystemScope:
507 case SyncScope::HIPSystem:
508 case SyncScope::OpenCLAllSVMDevices:
509 Name = "";
510 break;
511 }
512
513 // OpenCL assumes by default that atomic scopes are per-address space for
514 // non-sequentially consistent operations.
515 if (Scope >= SyncScope::OpenCLWorkGroup &&
516 Scope <= SyncScope::OpenCLSubGroup &&
517 Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
518 if (!Name.empty())
519 Name = Twine(Twine(Name) + Twine("-")).str();
520
521 Name = Twine(Twine(Name) + Twine("one-as")).str();
522 }
523
524 return Ctx.getOrInsertSyncScopeID(Name);
525 }
526
setTargetAtomicMetadata(CodeGenFunction & CGF,llvm::Instruction & AtomicInst,const AtomicExpr * AE) const527 void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
528 CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
529 const AtomicExpr *AE) const {
530 auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
531 auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);
532
533 // OpenCL and old style HIP atomics consider atomics targeting thread private
534 // memory to be undefined.
535 //
536 // TODO: This is probably undefined for atomic load/store, but there's not
537 // much direct codegen benefit to knowing this.
538 if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
539 (CmpX &&
540 CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
541 AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
542 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
543 llvm::MDNode *ASRange = MDHelper.createRange(
544 llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
545 llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
546 AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
547 }
548
549 if (!RMW)
550 return;
551
552 AtomicOptions AO = CGF.CGM.getAtomicOpts();
553 llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
554 if (!AO.getOption(clang::AtomicOptionKind::FineGrainedMemory))
555 RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
556 if (!AO.getOption(clang::AtomicOptionKind::RemoteMemory))
557 RMW->setMetadata("amdgpu.no.remote.memory", Empty);
558 if (AO.getOption(clang::AtomicOptionKind::IgnoreDenormalMode) &&
559 RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
560 RMW->getType()->isFloatTy())
561 RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
562 }
563
shouldEmitStaticExternCAliases() const564 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
565 return false;
566 }
567
shouldEmitDWARFBitFieldSeparators() const568 bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
569 return true;
570 }
571
setCUDAKernelCallingConvention(const FunctionType * & FT) const572 void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
573 const FunctionType *&FT) const {
574 FT = getABIInfo().getContext().adjustFunctionType(
575 FT, FT->getExtInfo().withCallingConv(CC_DeviceKernel));
576 }
577
578 /// Return IR struct type for rtinfo struct in rocm-device-libs used for device
579 /// enqueue.
580 ///
581 /// ptr addrspace(1) kernel_object, i32 private_segment_size,
582 /// i32 group_segment_size
583
584 static llvm::StructType *
getAMDGPURuntimeHandleType(llvm::LLVMContext & C,llvm::Type * KernelDescriptorPtrTy)585 getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
586 llvm::Type *KernelDescriptorPtrTy) {
587 llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
588 return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
589 "block.runtime.handle.t");
590 }
591
/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  // First kernel argument: the block literal, by value (address space 0).
  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  // Remaining arguments mirror the invoke function's trailing parameters
  // (local pointers, address space 3).
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }

  llvm::Module &Mod = CGF.CGM.getModule();
  const llvm::DataLayout &DL = Mod.getDataLayout();

  llvm::Twine Name = Invoke->getName() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);

  // The kernel itself can be internal, the runtime does not directly access the
  // kernel address (only the kernel descriptor).
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &Mod);
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  F->addFnAttrs(KernelAttrs);

  // Build the kernel body: copy the by-value block literal onto the stack
  // and call the invoke function with a pointer to it.
  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  // Attach OpenCL kernel-argument metadata collected above.
  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
      C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
  llvm::Constant *RuntimeHandleInitializer =
      llvm::ConstantAggregateZero::get(HandleTy);

  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

  // The runtime needs access to the runtime handle as an external symbol. The
  // runtime handle will need to be made external later, in
  // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
  // inside the runtime handle, and is not directly referenced.

  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
  // to reference the kernel descriptor. The runtime wouldn't need to bother
  // setting it. We would need to have a final symbol name though.
  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
  auto *RuntimeHandle = new llvm::GlobalVariable(
      Mod, HandleTy,
      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
      /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace(),
      /*isExternallyInitialized=*/true);

  llvm::MDNode *HandleAsMD =
      llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
  F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);

  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

  // Keep both symbols alive until AMDGPUExportOpenCLEnqueuedBlocks runs.
  CGF.CGM.addUsedGlobal(F);
  CGF.CGM.addUsedGlobal(RuntimeHandle);
  return RuntimeHandle;
}
710
handleAMDGPUFlatWorkGroupSizeAttr(llvm::Function * F,const AMDGPUFlatWorkGroupSizeAttr * FlatWGS,const ReqdWorkGroupSizeAttr * ReqdWGS,int32_t * MinThreadsVal,int32_t * MaxThreadsVal)711 void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
712 llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
713 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
714 int32_t *MaxThreadsVal) {
715 unsigned Min = 0;
716 unsigned Max = 0;
717 auto Eval = [&](Expr *E) {
718 return E->EvaluateKnownConstInt(getContext()).getExtValue();
719 };
720 if (FlatWGS) {
721 Min = Eval(FlatWGS->getMin());
722 Max = Eval(FlatWGS->getMax());
723 }
724 if (ReqdWGS && Min == 0 && Max == 0)
725 Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
726 Eval(ReqdWGS->getZDim());
727
728 if (Min != 0) {
729 assert(Min <= Max && "Min must be less than or equal Max");
730
731 if (MinThreadsVal)
732 *MinThreadsVal = Min;
733 if (MaxThreadsVal)
734 *MaxThreadsVal = Max;
735 std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
736 if (F)
737 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
738 } else
739 assert(Max == 0 && "Max must be zero");
740 }
741
handleAMDGPUWavesPerEUAttr(llvm::Function * F,const AMDGPUWavesPerEUAttr * Attr)742 void CodeGenModule::handleAMDGPUWavesPerEUAttr(
743 llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
744 unsigned Min =
745 Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
746 unsigned Max =
747 Attr->getMax()
748 ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
749 : 0;
750
751 if (Min != 0) {
752 assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
753
754 std::string AttrVal = llvm::utostr(Min);
755 if (Max != 0)
756 AttrVal = AttrVal + "," + llvm::utostr(Max);
757 F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
758 } else
759 assert(Max == 0 && "Max must be zero");
760 }
761
762 std::unique_ptr<TargetCodeGenInfo>
createAMDGPUTargetCodeGenInfo(CodeGenModule & CGM)763 CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
764 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
765 }
766