//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
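  // For illustration (assuming the amdgcn convention that the generic/default
  // address space is 0 and the global address space is 1): a kernel argument
  // lowered as `ptr` is rewritten to `ptr addrspace(1)`, while non-pointer
  // types and pointers already in another address space are left unchanged.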
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
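  // For example, 16 floats (16 members x 1 register each) still qualify,
  // whereas 16 doubles (16 x 2 registers) do not.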
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
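/// For example, under the rules below a <4 x float> vector needs four
/// registers, a <4 x half> vector packs into two, and a struct containing a
/// float and a double needs three.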
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or pair.
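      // For example, a returned struct of two floats (64 bits) comes back
      // directly as [2 x i32] rather than being returned indirectly.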
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
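/// For example, a HIP kernel taking a plain aggregate by value receives it
/// byref in the constant address space, while scalar pointer parameters are
/// coerced from the generic to the global address space.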
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or pair.
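    // For example, a struct holding an int and a short occupies 64 bits after
    // padding, so it is passed as [2 x i32] and charged two of the 16
    // registers budgeted for arguments and the return value.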
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // the function ABI.
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
      llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

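// OpenCL/HIP kernel entry points and device-side variables are emitted with
// hidden visibility by default; the predicate below selects the declarations
// that must be promoted to protected visibility, presumably so the runtime
// can still resolve their symbols at load time.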
static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<OpenCLKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
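    // For example, an OpenCL kernel with no explicit attributes ends up with
    // "amdgpu-flat-work-group-size"="1,256".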
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}

/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
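/// For example, when targeting code object v5 this emits, roughly (assuming
/// the usual amdgcn convention that the constant address space is 4):
///   @__oclc_ABI_version = weak_odr hidden addrspace(4) constant i32 500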
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "__oclc_ABI_version";
  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
  if (OriginalGV &&
      !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
    return;

  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
      llvm::CodeObjectVersionKind::COV_None)
    return;

  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
  llvm::Constant *COV = llvm::ConstantInt::get(
      Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // It needs to be constant weak_odr without externally_initialized so that
  // the load instruction can be eliminated by IPSCCP.
  auto *GV = new llvm::GlobalVariable(
      CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
      nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);

  // Replace any external references to this variable with the new global.
  if (OriginalGV) {
    OriginalGV->replaceAllUsesWith(GV);
    GV->takeName(OriginalGV);
    OriginalGV->eraseFromParent();
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0,
// which results in incorrectly transformed IR. Therefore, instead of
// emitting null pointers in private and local address spaces, a null
// pointer in the generic address space is emitted which is cast to a
// pointer in the local or private address space.
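// For example, a null pointer in the private address space (AS 5 on amdgcn)
// is emitted roughly as:
//   addrspacecast (ptr null to ptr addrspace(5))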
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

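  // For example, a seq_cst workgroup-scope atomic uses the "workgroup" sync
  // scope, while any weaker ordering gets the one-address-space variant,
  // "workgroup-one-as".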
  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same struct type on the stack, stores the block literal to
/// it, and passes its pointer to the block invoke function. The kernel has
/// the "enqueued-block" function attribute and kernel argument metadata.
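/// For illustration (function names hypothetical): for an invoke function
/// @foo_block_invoke that takes one extra local pointer argument, the wrapper
/// is roughly
///   define internal amdgpu_kernel void @foo_block_invoke_kernel(
///       <block literal struct>, ptr addrspace(3) %local_arg1)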
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

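// For example, __attribute__((reqd_work_group_size(8, 8, 1))) with no
// flat-work-group-size attribute yields "amdgpu-flat-work-group-size"="64,64".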
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  if (FlatWGS) {
    Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
    Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

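// For example, __attribute__((amdgpu_waves_per_eu(2, 4))) yields
// "amdgpu-waves-per-eu"="2,4", and amdgpu_waves_per_eu(2) yields just "2".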
void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) &&
           "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}