xref: /freebsd/contrib/llvm-project/clang/lib/CodeGen/Targets/AMDGPU.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- AMDGPU.cpp ---------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "clang/Basic/TargetOptions.h"
12 
13 using namespace clang;
14 using namespace clang::CodeGen;
15 
16 //===----------------------------------------------------------------------===//
17 // AMDGPU ABI Implementation
18 //===----------------------------------------------------------------------===//
19 
20 namespace {
21 
22 class AMDGPUABIInfo final : public DefaultABIInfo {
23 private:
24   static const unsigned MaxNumRegsForArgsRet = 16;
25 
26   unsigned numRegsForType(QualType Ty) const;
27 
28   bool isHomogeneousAggregateBaseType(QualType Ty) const override;
29   bool isHomogeneousAggregateSmallEnough(const Type *Base,
30                                          uint64_t Members) const override;
31 
32   // Coerce HIP scalar pointer arguments from generic pointers to global ones.
33   llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
34                                        unsigned ToAS) const {
35     // Single value types.
36     auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37     if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38       return llvm::PointerType::get(Ty->getContext(), ToAS);
39     return Ty;
40   }
41 
42 public:
43   explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
44     DefaultABIInfo(CGT) {}
45 
46   ABIArgInfo classifyReturnType(QualType RetTy) const;
47   ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
48   ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
49                                   unsigned &NumRegsLeft) const;
50 
51   void computeInfo(CGFunctionInfo &FI) const override;
52   RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
53                    AggValueSlot Slot) const override;
54 };
55 
56 bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
57   return true;
58 }
59 
60 bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
61   const Type *Base, uint64_t Members) const {
62   uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
63 
64   // Homogeneous Aggregates may occupy at most 16 registers.
65   return Members * NumRegs <= MaxNumRegsForArgsRet;
66 }
67 
68 /// Estimate number of registers the type will use when passed in registers.
69 unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
70   unsigned NumRegs = 0;
71 
72   if (const VectorType *VT = Ty->getAs<VectorType>()) {
73     // Compute from the number of elements. The reported size is based on the
74     // in-memory size, which includes the padding 4th element for 3-vectors.
75     QualType EltTy = VT->getElementType();
76     unsigned EltSize = getContext().getTypeSize(EltTy);
77 
78     // 16-bit element vectors should be passed as packed.
79     if (EltSize == 16)
80       return (VT->getNumElements() + 1) / 2;
81 
82     unsigned EltNumRegs = (EltSize + 31) / 32;
83     return EltNumRegs * VT->getNumElements();
84   }
85 
86   if (const RecordType *RT = Ty->getAs<RecordType>()) {
87     const RecordDecl *RD = RT->getDecl();
88     assert(!RD->hasFlexibleArrayMember());
89 
90     for (const FieldDecl *Field : RD->fields()) {
91       QualType FieldTy = Field->getType();
92       NumRegs += numRegsForType(FieldTy);
93     }
94 
95     return NumRegs;
96   }
97 
98   return (getContext().getTypeSize(Ty) + 31) / 32;
99 }
100 
101 void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
102   llvm::CallingConv::ID CC = FI.getCallingConvention();
103 
104   if (!getCXXABI().classifyReturnType(FI))
105     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
106 
107   unsigned ArgumentIndex = 0;
108   const unsigned numFixedArguments = FI.getNumRequiredArgs();
109 
110   unsigned NumRegsLeft = MaxNumRegsForArgsRet;
111   for (auto &Arg : FI.arguments()) {
112     if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
113       Arg.info = classifyKernelArgumentType(Arg.type);
114     } else {
115       bool FixedArgument = ArgumentIndex++ < numFixedArguments;
116       Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
117     }
118   }
119 }
120 
121 RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
122                                 QualType Ty, AggValueSlot Slot) const {
123   const bool IsIndirect = false;
124   const bool AllowHigherAlign = false;
125   return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
126                           getContext().getTypeInfoInChars(Ty),
127                           CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
128 }
129 
130 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
131   if (isAggregateTypeForABI(RetTy)) {
132     // Records with non-trivial destructors/copy-constructors should not be
133     // returned by value.
134     if (!getRecordArgABI(RetTy, getCXXABI())) {
135       // Ignore empty structs/unions.
136       if (isEmptyRecord(getContext(), RetTy, true))
137         return ABIArgInfo::getIgnore();
138 
139       // Lower single-element structs to just return a regular value.
140       if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
141         return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
142 
143       if (const RecordType *RT = RetTy->getAs<RecordType>()) {
144         const RecordDecl *RD = RT->getDecl();
145         if (RD->hasFlexibleArrayMember())
146           return DefaultABIInfo::classifyReturnType(RetTy);
147       }
148 
149       // Pack aggregates <= 4 bytes into single VGPR or pair.
150       uint64_t Size = getContext().getTypeSize(RetTy);
151       if (Size <= 16)
152         return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
153 
154       if (Size <= 32)
155         return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
156 
157       if (Size <= 64) {
158         llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
159         return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
160       }
161 
162       if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
163         return ABIArgInfo::getDirect();
164     }
165   }
166 
167   // Otherwise just do the default thing.
168   return DefaultABIInfo::classifyReturnType(RetTy);
169 }
170 
171 /// For kernels all parameters are really passed in a special buffer. It doesn't
172 /// make sense to pass anything byval, so everything must be direct.
173 ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
174   Ty = useFirstFieldIfTransparentUnion(Ty);
175 
176   // TODO: Can we omit empty structs?
177 
178   if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
179     Ty = QualType(SeltTy, 0);
180 
181   llvm::Type *OrigLTy = CGT.ConvertType(Ty);
182   llvm::Type *LTy = OrigLTy;
183   if (getContext().getLangOpts().HIP) {
184     LTy = coerceKernelArgumentType(
185         OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
186         /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
187   }
188 
189   // FIXME: Should also use this for OpenCL, but it requires addressing the
190   // problem of kernels being called.
191   //
192   // FIXME: This doesn't apply the optimization of coercing pointers in structs
193   // to global address space when using byref. This would require implementing a
194   // new kind of coercion of the in-memory type when for indirect arguments.
195   if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
196       isAggregateTypeForABI(Ty)) {
197     return ABIArgInfo::getIndirectAliased(
198         getContext().getTypeAlignInChars(Ty),
199         getContext().getTargetAddressSpace(LangAS::opencl_constant),
200         false /*Realign*/, nullptr /*Padding*/);
201   }
202 
203   // If we set CanBeFlattened to true, CodeGen will expand the struct to its
204   // individual elements, which confuses the Clover OpenCL backend; therefore we
205   // have to set it to false here. Other args of getDirect() are just defaults.
206   return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
207 }
208 
209 ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
210                                                unsigned &NumRegsLeft) const {
211   assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
212 
213   Ty = useFirstFieldIfTransparentUnion(Ty);
214 
215   if (Variadic) {
216     return ABIArgInfo::getDirect(/*T=*/nullptr,
217                                  /*Offset=*/0,
218                                  /*Padding=*/nullptr,
219                                  /*CanBeFlattened=*/false,
220                                  /*Align=*/0);
221   }
222 
223   if (isAggregateTypeForABI(Ty)) {
224     // Records with non-trivial destructors/copy-constructors should not be
225     // passed by value.
226     if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
227       return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
228 
229     // Ignore empty structs/unions.
230     if (isEmptyRecord(getContext(), Ty, true))
231       return ABIArgInfo::getIgnore();
232 
233     // Lower single-element structs to just pass a regular value. TODO: We
234     // could do reasonable-size multiple-element structs too, using getExpand(),
235     // though watch out for things like bitfields.
236     if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
237       return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
238 
239     if (const RecordType *RT = Ty->getAs<RecordType>()) {
240       const RecordDecl *RD = RT->getDecl();
241       if (RD->hasFlexibleArrayMember())
242         return DefaultABIInfo::classifyArgumentType(Ty);
243     }
244 
245     // Pack aggregates <= 8 bytes into single VGPR or pair.
246     uint64_t Size = getContext().getTypeSize(Ty);
247     if (Size <= 64) {
248       unsigned NumRegs = (Size + 31) / 32;
249       NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
250 
251       if (Size <= 16)
252         return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
253 
254       if (Size <= 32)
255         return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
256 
257       // XXX: Should this be i64 instead, and should the limit increase?
258       llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
259       return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
260     }
261 
262     if (NumRegsLeft > 0) {
263       unsigned NumRegs = numRegsForType(Ty);
264       if (NumRegsLeft >= NumRegs) {
265         NumRegsLeft -= NumRegs;
266         return ABIArgInfo::getDirect();
267       }
268     }
269 
270     // Use pass-by-reference in stead of pass-by-value for struct arguments in
271     // function ABI.
272     return ABIArgInfo::getIndirectAliased(
273         getContext().getTypeAlignInChars(Ty),
274         getContext().getTargetAddressSpace(LangAS::opencl_private));
275   }
276 
277   // Otherwise just do the default thing.
278   ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
279   if (!ArgInfo.isIndirect()) {
280     unsigned NumRegs = numRegsForType(Ty);
281     NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
282   }
283 
284   return ArgInfo;
285 }
286 
287 class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
288 public:
289   AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
290       : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
291 
292   void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
293                                  CodeGenModule &CGM) const;
294 
295   void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
296 
297   void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
298                            CodeGen::CodeGenModule &M) const override;
299   unsigned getOpenCLKernelCallingConv() const override;
300 
301   llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
302       llvm::PointerType *T, QualType QT) const override;
303 
304   LangAS getASTAllocaAddressSpace() const override {
305     return getLangASFromTargetAS(
306         getABIInfo().getDataLayout().getAllocaAddrSpace());
307   }
308   LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
309                                   const VarDecl *D) const override;
310   llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
311                                          SyncScope Scope,
312                                          llvm::AtomicOrdering Ordering,
313                                          llvm::LLVMContext &Ctx) const override;
314   llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
315                                          llvm::Function *BlockInvokeFunc,
316                                          llvm::Type *BlockTy) const override;
317   bool shouldEmitStaticExternCAliases() const override;
318   bool shouldEmitDWARFBitFieldSeparators() const override;
319   void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
320 };
321 }
322 
323 static bool requiresAMDGPUProtectedVisibility(const Decl *D,
324                                               llvm::GlobalValue *GV) {
325   if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
326     return false;
327 
328   return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
329          (D->hasAttr<OpenCLKernelAttr>() ||
330           (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
331           (isa<VarDecl>(D) &&
332            (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
333             cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
334             cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
335 }
336 
337 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
338     const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
339   const auto *ReqdWGS =
340       M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
341   const bool IsOpenCLKernel =
342       M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
343   const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
344 
345   const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
346   if (ReqdWGS || FlatWGS) {
347     M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
348   } else if (IsOpenCLKernel || IsHIPKernel) {
349     // By default, restrict the maximum size to a value specified by
350     // --gpu-max-threads-per-block=n or its default value for HIP.
351     const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
352     const unsigned DefaultMaxWorkGroupSize =
353         IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
354                        : M.getLangOpts().GPUMaxThreadsPerBlock;
355     std::string AttrVal =
356         std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
357     F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
358   }
359 
360   if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
361     M.handleAMDGPUWavesPerEUAttr(F, Attr);
362 
363   if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
364     unsigned NumSGPR = Attr->getNumSGPR();
365 
366     if (NumSGPR != 0)
367       F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
368   }
369 
370   if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
371     uint32_t NumVGPR = Attr->getNumVGPR();
372 
373     if (NumVGPR != 0)
374       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
375   }
376 
377   if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
378     uint32_t X = Attr->getMaxNumWorkGroupsX()
379                      ->EvaluateKnownConstInt(M.getContext())
380                      .getExtValue();
381     // Y and Z dimensions default to 1 if not specified
382     uint32_t Y = Attr->getMaxNumWorkGroupsY()
383                      ? Attr->getMaxNumWorkGroupsY()
384                            ->EvaluateKnownConstInt(M.getContext())
385                            .getExtValue()
386                      : 1;
387     uint32_t Z = Attr->getMaxNumWorkGroupsZ()
388                      ? Attr->getMaxNumWorkGroupsZ()
389                            ->EvaluateKnownConstInt(M.getContext())
390                            .getExtValue()
391                      : 1;
392 
393     llvm::SmallString<32> AttrVal;
394     llvm::raw_svector_ostream OS(AttrVal);
395     OS << X << ',' << Y << ',' << Z;
396 
397     F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
398   }
399 }
400 
401 /// Emits control constants used to change per-architecture behaviour in the
402 /// AMDGPU ROCm device libraries.
403 void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
404     CodeGen::CodeGenModule &CGM) const {
405   StringRef Name = "__oclc_ABI_version";
406   llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
407   if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
408     return;
409 
410   if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
411       llvm::CodeObjectVersionKind::COV_None)
412     return;
413 
414   auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
415   llvm::Constant *COV = llvm::ConstantInt::get(
416       Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
417 
418   // It needs to be constant weak_odr without externally_initialized so that
419   // the load instuction can be eliminated by the IPSCCP.
420   auto *GV = new llvm::GlobalVariable(
421       CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
422       nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
423       CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
424   GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
425   GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
426 
427   // Replace any external references to this variable with the new global.
428   if (OriginalGV) {
429     OriginalGV->replaceAllUsesWith(GV);
430     GV->takeName(OriginalGV);
431     OriginalGV->eraseFromParent();
432   }
433 }
434 
435 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
436     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
437   if (requiresAMDGPUProtectedVisibility(D, GV)) {
438     GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
439     GV->setDSOLocal(true);
440   }
441 
442   if (GV->isDeclaration())
443     return;
444 
445   llvm::Function *F = dyn_cast<llvm::Function>(GV);
446   if (!F)
447     return;
448 
449   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
450   if (FD)
451     setFunctionDeclAttributes(FD, F, M);
452 
453   if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
454     F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
455 
456   if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
457     F->addFnAttr("amdgpu-ieee", "false");
458 }
459 
460 unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
461   return llvm::CallingConv::AMDGPU_KERNEL;
462 }
463 
464 // Currently LLVM assumes null pointers always have value 0,
465 // which results in incorrectly transformed IR. Therefore, instead of
466 // emitting null pointers in private and local address spaces, a null
467 // pointer in generic address space is emitted which is casted to a
468 // pointer in local or private address space.
469 llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
470     const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
471     QualType QT) const {
472   if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
473     return llvm::ConstantPointerNull::get(PT);
474 
475   auto &Ctx = CGM.getContext();
476   auto NPT = llvm::PointerType::get(
477       PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
478   return llvm::ConstantExpr::getAddrSpaceCast(
479       llvm::ConstantPointerNull::get(NPT), PT);
480 }
481 
482 LangAS
483 AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
484                                                   const VarDecl *D) const {
485   assert(!CGM.getLangOpts().OpenCL &&
486          !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
487          "Address space agnostic languages only");
488   LangAS DefaultGlobalAS = getLangASFromTargetAS(
489       CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
490   if (!D)
491     return DefaultGlobalAS;
492 
493   LangAS AddrSpace = D->getType().getAddressSpace();
494   if (AddrSpace != LangAS::Default)
495     return AddrSpace;
496 
497   // Only promote to address space 4 if VarDecl has constant initialization.
498   if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
499       D->hasConstantInitialization()) {
500     if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
501       return *ConstAS;
502   }
503   return DefaultGlobalAS;
504 }
505 
506 llvm::SyncScope::ID
507 AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
508                                             SyncScope Scope,
509                                             llvm::AtomicOrdering Ordering,
510                                             llvm::LLVMContext &Ctx) const {
511   std::string Name;
512   switch (Scope) {
513   case SyncScope::HIPSingleThread:
514   case SyncScope::SingleScope:
515     Name = "singlethread";
516     break;
517   case SyncScope::HIPWavefront:
518   case SyncScope::OpenCLSubGroup:
519   case SyncScope::WavefrontScope:
520     Name = "wavefront";
521     break;
522   case SyncScope::HIPWorkgroup:
523   case SyncScope::OpenCLWorkGroup:
524   case SyncScope::WorkgroupScope:
525     Name = "workgroup";
526     break;
527   case SyncScope::HIPAgent:
528   case SyncScope::OpenCLDevice:
529   case SyncScope::DeviceScope:
530     Name = "agent";
531     break;
532   case SyncScope::SystemScope:
533   case SyncScope::HIPSystem:
534   case SyncScope::OpenCLAllSVMDevices:
535     Name = "";
536     break;
537   }
538 
539   if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
540     if (!Name.empty())
541       Name = Twine(Twine(Name) + Twine("-")).str();
542 
543     Name = Twine(Twine(Name) + Twine("one-as")).str();
544   }
545 
546   return Ctx.getOrInsertSyncScopeID(Name);
547 }
548 
549 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
550   return false;
551 }
552 
553 bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
554   return true;
555 }
556 
557 void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
558     const FunctionType *&FT) const {
559   FT = getABIInfo().getContext().adjustFunctionType(
560       FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
561 }
562 
563 /// Create an OpenCL kernel for an enqueued block.
564 ///
565 /// The type of the first argument (the block literal) is the struct type
566 /// of the block literal instead of a pointer type. The first argument
567 /// (block literal) is passed directly by value to the kernel. The kernel
568 /// allocates the same type of struct on stack and stores the block literal
569 /// to it and passes its pointer to the block invoke function. The kernel
570 /// has "enqueued-block" function attribute and kernel argument metadata.
571 llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
572     CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
573   auto &Builder = CGF.Builder;
574   auto &C = CGF.getLLVMContext();
575 
576   auto *InvokeFT = Invoke->getFunctionType();
577   llvm::SmallVector<llvm::Type *, 2> ArgTys;
578   llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
579   llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
580   llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
581   llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
582   llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
583   llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
584 
585   ArgTys.push_back(BlockTy);
586   ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
587   AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
588   ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
589   ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
590   AccessQuals.push_back(llvm::MDString::get(C, "none"));
591   ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
592   for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
593     ArgTys.push_back(InvokeFT->getParamType(I));
594     ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
595     AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
596     AccessQuals.push_back(llvm::MDString::get(C, "none"));
597     ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
598     ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
599     ArgNames.push_back(
600         llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
601   }
602   std::string Name = Invoke->getName().str() + "_kernel";
603   auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
604   auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
605                                    &CGF.CGM.getModule());
606   F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
607 
608   llvm::AttrBuilder KernelAttrs(C);
609   // FIXME: The invoke isn't applying the right attributes either
610   // FIXME: This is missing setTargetAttributes
611   CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
612   KernelAttrs.addAttribute("enqueued-block");
613   F->addFnAttrs(KernelAttrs);
614 
615   auto IP = CGF.Builder.saveIP();
616   auto *BB = llvm::BasicBlock::Create(C, "entry", F);
617   Builder.SetInsertPoint(BB);
618   const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
619   auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
620   BlockPtr->setAlignment(BlockAlign);
621   Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
622   auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
623   llvm::SmallVector<llvm::Value *, 2> Args;
624   Args.push_back(Cast);
625   for (llvm::Argument &A : llvm::drop_begin(F->args()))
626     Args.push_back(&A);
627   llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
628   call->setCallingConv(Invoke->getCallingConv());
629   Builder.CreateRetVoid();
630   Builder.restoreIP(IP);
631 
632   F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
633   F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
634   F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
635   F->setMetadata("kernel_arg_base_type",
636                  llvm::MDNode::get(C, ArgBaseTypeNames));
637   F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
638   if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
639     F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
640 
641   return F;
642 }
643 
644 void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
645     llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
646     const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
647     int32_t *MaxThreadsVal) {
648   unsigned Min = 0;
649   unsigned Max = 0;
650   if (FlatWGS) {
651     Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
652     Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
653   }
654   if (ReqdWGS && Min == 0 && Max == 0)
655     Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
656 
657   if (Min != 0) {
658     assert(Min <= Max && "Min must be less than or equal Max");
659 
660     if (MinThreadsVal)
661       *MinThreadsVal = Min;
662     if (MaxThreadsVal)
663       *MaxThreadsVal = Max;
664     std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
665     if (F)
666       F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
667   } else
668     assert(Max == 0 && "Max must be zero");
669 }
670 
671 void CodeGenModule::handleAMDGPUWavesPerEUAttr(
672     llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
673   unsigned Min =
674       Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
675   unsigned Max =
676       Attr->getMax()
677           ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
678           : 0;
679 
680   if (Min != 0) {
681     assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
682 
683     std::string AttrVal = llvm::utostr(Min);
684     if (Max != 0)
685       AttrVal = AttrVal + "," + llvm::utostr(Max);
686     F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
687   } else
688     assert(Max == 0 && "Max must be zero");
689 }
690 
691 std::unique_ptr<TargetCodeGenInfo>
692 CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
693   return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
694 }
695