//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
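  // For example (assuming the usual amdgcn address-space numbering, where the
  // generic/flat address space is 0 and the global address space is 1), a HIP
  // kernel parameter declared as `int *p` is lowered as a pointer in address
  // space 1 rather than a generic pointer.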
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                    QualType Ty) const override;
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
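  // For example, a homogeneous aggregate of 16 floats (16 members x 1 register
  // each) still qualifies, while one of 16 doubles (16 x 2 registers) does
  // not.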
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
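/// For example, a 4-element float vector counts as 4 registers, while a
/// 4-element half vector is packed and counts as 2.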
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
    }
  }
}

Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
  llvm_unreachable("AMDGPU does not support varargs");
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or a pair.
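      // For example, a struct of two floats (64 bits) is returned as
      // [2 x i32].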
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
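/// In practice (assuming the usual amdgcn numbering, where the constant
/// address space is 4), a non-OpenCL aggregate kernel argument ends up being
/// passed byref in the constant address space, while scalar HIP pointer
/// arguments are coerced to global pointers.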
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: Should also use this for OpenCL, but it requires addressing the
  // problem of kernels being called.
  //
  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to global address space when using byref. This would require implementing a
  // new kind of coercion of the in-memory type for indirect arguments.
  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
      isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or a pair.
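    // For example, a struct of two chars (16 bits) is passed as an i16, a
    // struct of three chars (24 bits) as an i32, and a struct of two ints
    // (64 bits) as [2 x i32]; either way the register budget below is charged
    // one register per 32 bits, rounded up.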
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getOpenCLKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
      llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
}

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return D->hasAttr<OpenCLKernelAttr>() ||
         (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
         (isa<VarDecl>(D) &&
          (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
}

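// Translate AMDGPU-related source-level attributes (reqd_work_group_size,
// amdgpu_flat_work_group_size, amdgpu_waves_per_eu, amdgpu_num_sgpr and
// amdgpu_num_vgpr) into the corresponding IR function attributes. For example,
// __attribute__((amdgpu_flat_work_group_size(64, 256))) becomes
// "amdgpu-flat-work-group-size"="64,256" on the emitted function.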
void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    unsigned Min = 0;
    unsigned Max = 0;
    if (FlatWGS) {
      Min = FlatWGS->getMin()
                ->EvaluateKnownConstInt(M.getContext())
                .getExtValue();
      Max = FlatWGS->getMax()
                ->EvaluateKnownConstInt(M.getContext())
                .getExtValue();
    }
    if (ReqdWGS && Min == 0 && Max == 0)
      Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();

    if (Min != 0) {
      assert(Min <= Max && "Min must be less than or equal Max");

      std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
    } else
      assert(Max == 0 && "Max must be zero");
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
    unsigned Min =
        Attr->getMin()->EvaluateKnownConstInt(M.getContext()).getExtValue();
    unsigned Max = Attr->getMax() ? Attr->getMax()
                                        ->EvaluateKnownConstInt(M.getContext())
                                        .getExtValue()
                                  : 0;

    if (Min != 0) {
      assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");

      std::string AttrVal = llvm::utostr(Min);
      if (Max != 0)
        AttrVal = AttrVal + "," + llvm::utostr(Max);
      F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
    } else
      assert(Max == 0 && "Max must be zero");
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  const bool IsHIPKernel =
      M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();

  // TODO: This should be moved to language specific attributes instead.
  if (IsHIPKernel)
    F->addFnAttr("uniform-work-group-size", "true");

  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
    F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then cast to a pointer in the local or private address
// space.
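// For example (assuming the usual amdgcn numbering, where the private address
// space is 5), a private null pointer is emitted as
// addrspacecast (ptr null to ptr addrspace(5)).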
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

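// Pick the LLVM address space for a global variable in address-space-agnostic
// languages: globals default to the target's global address space, but a
// variable with a constant type and constant initialization may be promoted to
// the constant address space (4 on amdgcn).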
LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  assert(AddrSpace == LangAS::Default || isTargetAddressSpace(AddrSpace));
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
  if (CGM.isTypeConstant(D->getType(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

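// Map language-level sync scopes to the AMDGPU sync-scope names ("wavefront",
// "workgroup", "agent", or "" for system scope). Orderings other than
// sequentially consistent additionally get the "one-as" suffix, so e.g. an
// acquire atomic at HIP workgroup scope uses the "workgroup-one-as" scope.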
llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
    Name = "agent";
    break;
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on the stack, stores the block literal
/// in it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
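/// For example, for an invoke function named @foo_block_invoke (an
/// illustrative name), the generated wrapper is an amdgpu_kernel named
/// @foo_block_invoke_kernel whose first parameter is the block literal struct
/// passed by value.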
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }
  std::string Name = Invoke->getName().str() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &CGF.CGM.getModule());
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  KernelAttrs.addAttribute("enqueued-block");
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  return F;
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}