//===- AMDGPU.cpp ---------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/AMDGPUAddrSpace.h"

using namespace clang;
using namespace clang::CodeGen;

//===----------------------------------------------------------------------===//
// AMDGPU ABI Implementation
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUABIInfo final : public DefaultABIInfo {
private:
  static const unsigned MaxNumRegsForArgsRet = 16;

  unsigned numRegsForType(QualType Ty) const;

  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
  bool isHomogeneousAggregateSmallEnough(const Type *Base,
                                         uint64_t Members) const override;

  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
                                       unsigned ToAS) const {
    // Single value types.
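    // Illustrative sketch (assuming the usual AMDGPU numbering, where the
    // generic address space is 0 and the global address space is 1): a HIP
    // kernel parameter declared as `int *p` lowers to `ptr` and is rewritten
    // here to `ptr addrspace(1)` in the kernel signature.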
    auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
    if (PtrTy && PtrTy->getAddressSpace() == FromAS)
      return llvm::PointerType::get(Ty->getContext(), ToAS);
    return Ty;
  }

public:
  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
    DefaultABIInfo(CGT) {}

  ABIArgInfo classifyReturnType(QualType RetTy) const;
  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
  ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
                                  unsigned &NumRegsLeft) const;

  void computeInfo(CGFunctionInfo &FI) const override;
  RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
                   AggValueSlot Slot) const override;

  llvm::FixedVectorType *
  getOptimalVectorMemoryType(llvm::FixedVectorType *T,
                             const LangOptions &Opt) const override {
    // We have legal instructions for 96-bit so 3x32 can be supported.
    // FIXME: This check should be a subtarget feature as technically SI doesn't
    // support it.
    if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
      return T;
    return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
  }
};

bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
  return true;
}

bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
  const Type *Base, uint64_t Members) const {
  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;

  // Homogeneous Aggregates may occupy at most 16 registers.
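  // For example, a struct of 16 floats (Members = 16, NumRegs = 1 each) fits
  // exactly, while a struct of 16 doubles (NumRegs = 2 each) does not.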
  return Members * NumRegs <= MaxNumRegsForArgsRet;
}

/// Estimate number of registers the type will use when passed in registers.
unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
  unsigned NumRegs = 0;

  if (const VectorType *VT = Ty->getAs<VectorType>()) {
    // Compute from the number of elements. The reported size is based on the
    // in-memory size, which includes the padding 4th element for 3-vectors.
    QualType EltTy = VT->getElementType();
    unsigned EltSize = getContext().getTypeSize(EltTy);

    // 16-bit element vectors should be passed as packed.
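    // For example, a <4 x half> vector occupies (4 + 1) / 2 = 2 registers
    // rather than the 4 a per-element count would give.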
    if (EltSize == 16)
      return (VT->getNumElements() + 1) / 2;

    unsigned EltNumRegs = (EltSize + 31) / 32;
    return EltNumRegs * VT->getNumElements();
  }

  if (const RecordType *RT = Ty->getAs<RecordType>()) {
    const RecordDecl *RD = RT->getDecl();
    assert(!RD->hasFlexibleArrayMember());

    for (const FieldDecl *Field : RD->fields()) {
      QualType FieldTy = Field->getType();
      NumRegs += numRegsForType(FieldTy);
    }

    return NumRegs;
  }

  return (getContext().getTypeSize(Ty) + 31) / 32;
}

void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
  llvm::CallingConv::ID CC = FI.getCallingConvention();

  if (!getCXXABI().classifyReturnType(FI))
    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());

  unsigned ArgumentIndex = 0;
  const unsigned numFixedArguments = FI.getNumRequiredArgs();

  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
  for (auto &Arg : FI.arguments()) {
    if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
      Arg.info = classifyKernelArgumentType(Arg.type);
    } else {
      bool FixedArgument = ArgumentIndex++ < numFixedArguments;
      Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
    }
  }
}

RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                QualType Ty, AggValueSlot Slot) const {
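  // Everything is read back from successive 4-byte-aligned stack slots
  // (IsIndirect and AllowHigherAlign are both off below), matching how the
  // variadic arguments were laid out by the caller.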
  const bool IsIndirect = false;
  const bool AllowHigherAlign = false;
  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                          getContext().getTypeInfoInChars(Ty),
                          CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
}

ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
  if (isAggregateTypeForABI(RetTy)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // returned by value.
    if (!getRecordArgABI(RetTy, getCXXABI())) {
      // Ignore empty structs/unions.
      if (isEmptyRecord(getContext(), RetTy, true))
        return ABIArgInfo::getIgnore();

      // Lower single-element structs to just return a regular value.
      if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
        return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

      if (const RecordType *RT = RetTy->getAs<RecordType>()) {
        const RecordDecl *RD = RT->getDecl();
        if (RD->hasFlexibleArrayMember())
          return DefaultABIInfo::classifyReturnType(RetTy);
      }

      // Pack aggregates <= 8 bytes into a single VGPR or a pair.
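      // For example, struct { short a; char b; } (32 bits) is returned as an
      // i32, and struct { int x; float y; } (64 bits) as [2 x i32].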
      uint64_t Size = getContext().getTypeSize(RetTy);
      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      if (Size <= 64) {
        llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
        return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
      }

      if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
        return ABIArgInfo::getDirect();
    }
  }

  // Otherwise just do the default thing.
  return DefaultABIInfo::classifyReturnType(RetTy);
}

/// For kernels, all parameters are really passed in a special buffer. It
/// doesn't make sense to pass anything byval, so everything must be direct.
ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
  Ty = useFirstFieldIfTransparentUnion(Ty);

  // TODO: Can we omit empty structs?

  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
    Ty = QualType(SeltTy, 0);

  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
  llvm::Type *LTy = OrigLTy;
  if (getContext().getLangOpts().HIP) {
    LTy = coerceKernelArgumentType(
        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
  }

  // FIXME: This doesn't apply the optimization of coercing pointers in structs
  // to the global address space when using byref. This would require
  // implementing a new kind of coercion of the in-memory type for indirect
  // arguments.
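  // Illustrative sketch (assuming the OpenCL constant address space maps to
  // AS 4): a kernel parameter of type struct S { int x[4]; } is lowered to
  // ptr addrspace(4) byref(%struct.S) align 4.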
  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_constant),
        false /*Realign*/, nullptr /*Padding*/);
  }

  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
  // individual elements, which confuses the Clover OpenCL backend; therefore we
  // have to set it to false here. Other args of getDirect() are just defaults.
  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
}

ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
                                               unsigned &NumRegsLeft) const {
  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");

  Ty = useFirstFieldIfTransparentUnion(Ty);

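  // Variadic arguments are passed directly but never flattened, so they keep
  // their natural in-memory layout and can be read back from the stack by
  // EmitVAArg above.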
  if (Variadic) {
    return ABIArgInfo::getDirect(/*T=*/nullptr,
                                 /*Offset=*/0,
                                 /*Padding=*/nullptr,
                                 /*CanBeFlattened=*/false,
                                 /*Align=*/0);
  }

  if (isAggregateTypeForABI(Ty)) {
    // Records with non-trivial destructors/copy-constructors should not be
    // passed by value.
    if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
                                     RAA == CGCXXABI::RAA_DirectInMemory);

    // Ignore empty structs/unions.
    if (isEmptyRecord(getContext(), Ty, true))
      return ABIArgInfo::getIgnore();

    // Lower single-element structs to just pass a regular value. TODO: We
    // could do reasonable-size multiple-element structs too, using getExpand(),
    // though watch out for things like bitfields.
    if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
      return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));

    if (const RecordType *RT = Ty->getAs<RecordType>()) {
      const RecordDecl *RD = RT->getDecl();
      if (RD->hasFlexibleArrayMember())
        return DefaultABIInfo::classifyArgumentType(Ty);
    }

    // Pack aggregates <= 8 bytes into a single VGPR or a pair.
    uint64_t Size = getContext().getTypeSize(Ty);
    if (Size <= 64) {
      unsigned NumRegs = (Size + 31) / 32;
      NumRegsLeft -= std::min(NumRegsLeft, NumRegs);

      if (Size <= 16)
        return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));

      if (Size <= 32)
        return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));

      // XXX: Should this be i64 instead, and should the limit increase?
      llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
      return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
    }

    if (NumRegsLeft > 0) {
      unsigned NumRegs = numRegsForType(Ty);
      if (NumRegsLeft >= NumRegs) {
        NumRegsLeft -= NumRegs;
        return ABIArgInfo::getDirect();
      }
    }

    // Use pass-by-reference instead of pass-by-value for struct arguments in
    // function ABI.
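    // Illustrative sketch (assuming the private address space maps to AS 5):
    // an aggregate that no longer fits in the remaining register budget is
    // passed as ptr addrspace(5) byref(%struct.S) align <n>.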
    return ABIArgInfo::getIndirectAliased(
        getContext().getTypeAlignInChars(Ty),
        getContext().getTargetAddressSpace(LangAS::opencl_private));
  }

  // Otherwise just do the default thing.
  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
  if (!ArgInfo.isIndirect()) {
    unsigned NumRegs = numRegsForType(Ty);
    NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
  }

  return ArgInfo;
}

class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
public:
  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
      : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}

  bool supportsLibCall() const override { return false; }
  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
                                 CodeGenModule &CGM) const;

  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                           CodeGen::CodeGenModule &M) const override;
  unsigned getDeviceKernelCallingConv() const override;

  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
      llvm::PointerType *T, QualType QT) const override;

  LangAS getASTAllocaAddressSpace() const override {
    return getLangASFromTargetAS(
        getABIInfo().getDataLayout().getAllocaAddrSpace());
  }
  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                  const VarDecl *D) const override;
  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
                                         SyncScope Scope,
                                         llvm::AtomicOrdering Ordering,
                                         llvm::LLVMContext &Ctx) const override;
  void setTargetAtomicMetadata(CodeGenFunction &CGF,
                               llvm::Instruction &AtomicInst,
                               const AtomicExpr *Expr = nullptr) const override;
  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                         llvm::Function *BlockInvokeFunc,
                                         llvm::Type *BlockTy) const override;
  bool shouldEmitStaticExternCAliases() const override;
  bool shouldEmitDWARFBitFieldSeparators() const override;
  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
};
} // namespace

static bool requiresAMDGPUProtectedVisibility(const Decl *D,
                                              llvm::GlobalValue *GV) {
  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
    return false;

  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
         (D->hasAttr<DeviceKernelAttr>() ||
          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
          (isa<VarDecl>(D) &&
           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
}

void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
    const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
  const auto *ReqdWGS =
      M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
  const bool IsOpenCLKernel =
      M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();

  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
  if (ReqdWGS || FlatWGS) {
    M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
  } else if (IsOpenCLKernel || IsHIPKernel) {
    // By default, restrict the maximum size to a value specified by
    // --gpu-max-threads-per-block=n or its default value for HIP.
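    // For example, an OpenCL kernel with no explicit attribute ends up with
    // "amdgpu-flat-work-group-size"="1,256".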
    const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
    const unsigned DefaultMaxWorkGroupSize =
        IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
                       : M.getLangOpts().GPUMaxThreadsPerBlock;
    std::string AttrVal =
        std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
    F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  }

  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
    M.handleAMDGPUWavesPerEUAttr(F, Attr);

  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
    unsigned NumSGPR = Attr->getNumSGPR();

    if (NumSGPR != 0)
      F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
    uint32_t NumVGPR = Attr->getNumVGPR();

    if (NumVGPR != 0)
      F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
  }

  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
    uint32_t X = Attr->getMaxNumWorkGroupsX()
                     ->EvaluateKnownConstInt(M.getContext())
                     .getExtValue();
    // Y and Z dimensions default to 1 if not specified.
    uint32_t Y = Attr->getMaxNumWorkGroupsY()
                     ? Attr->getMaxNumWorkGroupsY()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;
    uint32_t Z = Attr->getMaxNumWorkGroupsZ()
                     ? Attr->getMaxNumWorkGroupsZ()
                           ->EvaluateKnownConstInt(M.getContext())
                           .getExtValue()
                     : 1;

    llvm::SmallString<32> AttrVal;
    llvm::raw_svector_ostream OS(AttrVal);
    OS << X << ',' << Y << ',' << Z;

    F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
  }
}

void AMDGPUTargetCodeGenInfo::setTargetAttributes(
    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
  if (requiresAMDGPUProtectedVisibility(D, GV)) {
    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
    GV->setDSOLocal(true);
  }

  if (GV->isDeclaration())
    return;

  llvm::Function *F = dyn_cast<llvm::Function>(GV);
  if (!F)
    return;

  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
  if (FD)
    setFunctionDeclAttributes(FD, F, M);

  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
    F->addFnAttr("amdgpu-ieee", "false");
}

unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
  return llvm::CallingConv::AMDGPU_KERNEL;
}

// Currently LLVM assumes null pointers always have value 0, which results in
// incorrectly transformed IR. Therefore, instead of emitting null pointers in
// the private and local address spaces, a null pointer in the generic address
// space is emitted and then cast to a pointer in the local or private address
// space.
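// Illustrative sketch (assuming the usual AMDGPU numbering, where local is
// AS 3 and its null value is 0xFFFFFFFF rather than 0): a null local pointer
// is emitted as addrspacecast (ptr null to ptr addrspace(3)).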
llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
    const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
    QualType QT) const {
  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
    return llvm::ConstantPointerNull::get(PT);

  auto &Ctx = CGM.getContext();
  auto NPT = llvm::PointerType::get(
      PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
  return llvm::ConstantExpr::getAddrSpaceCast(
      llvm::ConstantPointerNull::get(NPT), PT);
}

LangAS
AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
                                                  const VarDecl *D) const {
  assert(!CGM.getLangOpts().OpenCL &&
         !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
         "Address space agnostic languages only");
  LangAS DefaultGlobalAS = getLangASFromTargetAS(
      CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
  if (!D)
    return DefaultGlobalAS;

  LangAS AddrSpace = D->getType().getAddressSpace();
  if (AddrSpace != LangAS::Default)
    return AddrSpace;

  // Only promote to address space 4 if VarDecl has constant initialization.
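  // For example, a global `constexpr` table whose initializer is a constant
  // expression can be placed in the constant address space.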
  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
      D->hasConstantInitialization()) {
    if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
      return *ConstAS;
  }
  return DefaultGlobalAS;
}

llvm::SyncScope::ID
AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
                                            SyncScope Scope,
                                            llvm::AtomicOrdering Ordering,
                                            llvm::LLVMContext &Ctx) const {
  std::string Name;
  switch (Scope) {
  case SyncScope::HIPSingleThread:
  case SyncScope::SingleScope:
    Name = "singlethread";
    break;
  case SyncScope::HIPWavefront:
  case SyncScope::OpenCLSubGroup:
  case SyncScope::WavefrontScope:
    Name = "wavefront";
    break;
  case SyncScope::HIPWorkgroup:
  case SyncScope::OpenCLWorkGroup:
  case SyncScope::WorkgroupScope:
    Name = "workgroup";
    break;
  case SyncScope::HIPAgent:
  case SyncScope::OpenCLDevice:
  case SyncScope::DeviceScope:
    Name = "agent";
    break;
  case SyncScope::SystemScope:
  case SyncScope::HIPSystem:
  case SyncScope::OpenCLAllSVMDevices:
    Name = "";
    break;
  }

  // OpenCL assumes by default that atomic scopes are per-address space for
  // non-sequentially consistent operations.
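  // For example, a relaxed OpenCL work-group atomic is given the sync scope
  // "workgroup-one-as" instead of plain "workgroup".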
  if (Scope >= SyncScope::OpenCLWorkGroup &&
      Scope <= SyncScope::OpenCLSubGroup &&
      Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
    if (!Name.empty())
      Name = Twine(Twine(Name) + Twine("-")).str();

    Name = Twine(Twine(Name) + Twine("one-as")).str();
  }

  return Ctx.getOrInsertSyncScopeID(Name);
}

void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
    const AtomicExpr *AE) const {
  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);

  // OpenCL and old style HIP atomics consider atomics targeting thread private
  // memory to be undefined.
  //
  // TODO: This is probably undefined for atomic load/store, but there's not
  // much direct codegen benefit to knowing this.
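  // The resulting !noalias.addrspace range metadata (e.g. !{i32 5, i32 6},
  // assuming private is AS 5) tells the backend that the flat pointer never
  // addresses private memory.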
  if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
       (CmpX &&
        CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
      AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
    llvm::MDNode *ASRange = MDHelper.createRange(
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
        llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
    AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
  }

  if (!RMW)
    return;

  AtomicOptions AO = CGF.CGM.getAtomicOpts();
  llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
  if (!AO.getOption(clang::AtomicOptionKind::FineGrainedMemory))
    RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
  if (!AO.getOption(clang::AtomicOptionKind::RemoteMemory))
    RMW->setMetadata("amdgpu.no.remote.memory", Empty);
  if (AO.getOption(clang::AtomicOptionKind::IgnoreDenormalMode) &&
      RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
      RMW->getType()->isFloatTy())
    RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
}

bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
  return false;
}

bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
  return true;
}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
    const FunctionType *&FT) const {
  FT = getABIInfo().getContext().adjustFunctionType(
      FT, FT->getExtInfo().withCallingConv(CC_DeviceKernel));
}

/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
/// enqueue.
///
/// ptr addrspace(1) kernel_object, i32 private_segment_size,
/// i32 group_segment_size
static llvm::StructType *
getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
                           llvm::Type *KernelDescriptorPtrTy) {
  llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
  return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
                                  "block.runtime.handle.t");
}

/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates a struct of the same type on the stack, stores the block literal
/// to it, and passes its pointer to the block invoke function. The kernel
/// has the "enqueued-block" function attribute and kernel argument metadata.
llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
    CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
  auto &Builder = CGF.Builder;
  auto &C = CGF.getLLVMContext();

  auto *InvokeFT = Invoke->getFunctionType();
  llvm::SmallVector<llvm::Type *, 2> ArgTys;
  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;

  ArgTys.push_back(BlockTy);
  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
  AccessQuals.push_back(llvm::MDString::get(C, "none"));
  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
    ArgTys.push_back(InvokeFT->getParamType(I));
    ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
    AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
    AccessQuals.push_back(llvm::MDString::get(C, "none"));
    ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
    ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
    ArgNames.push_back(
        llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
  }

  llvm::Module &Mod = CGF.CGM.getModule();
  const llvm::DataLayout &DL = Mod.getDataLayout();

  llvm::Twine Name = Invoke->getName() + "_kernel";
  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);

  // The kernel itself can be internal; the runtime does not directly access
  // the kernel address (only the kernel descriptor).
  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
                                   &Mod);
  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

  llvm::AttrBuilder KernelAttrs(C);
  // FIXME: The invoke isn't applying the right attributes either
  // FIXME: This is missing setTargetAttributes
  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
  F->addFnAttrs(KernelAttrs);

  auto IP = CGF.Builder.saveIP();
  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
  Builder.SetInsertPoint(BB);
  const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
  BlockPtr->setAlignment(BlockAlign);
  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
  llvm::SmallVector<llvm::Value *, 2> Args;
  Args.push_back(Cast);
  for (llvm::Argument &A : llvm::drop_begin(F->args()))
    Args.push_back(&A);
  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
  call->setCallingConv(Invoke->getCallingConv());
  Builder.CreateRetVoid();
  Builder.restoreIP(IP);

  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
  F->setMetadata("kernel_arg_base_type",
                 llvm::MDNode::get(C, ArgBaseTypeNames));
  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
    F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

  llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
      C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
  llvm::Constant *RuntimeHandleInitializer =
      llvm::ConstantAggregateZero::get(HandleTy);

  llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

  // The runtime needs access to the runtime handle as an external symbol. The
  // runtime handle will need to be made external later, in
  // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
  // inside the runtime handle, and is not directly referenced.

  // TODO: We would initialize the first field by declaring F->getName() + ".kd"
  // to reference the kernel descriptor. The runtime wouldn't need to bother
  // setting it. We would need to have a final symbol name though.
  // TODO: Can we directly use an external symbol with getGlobalIdentifier?
  auto *RuntimeHandle = new llvm::GlobalVariable(
      Mod, HandleTy,
      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
      /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
      /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace(),
      /*isExternallyInitialized=*/true);

  llvm::MDNode *HandleAsMD =
      llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
  F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);

  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

  CGF.CGM.addUsedGlobal(F);
  CGF.CGM.addUsedGlobal(RuntimeHandle);
  return RuntimeHandle;
}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
    llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
    const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
    int32_t *MaxThreadsVal) {
  unsigned Min = 0;
  unsigned Max = 0;
  auto Eval = [&](Expr *E) {
    return E->EvaluateKnownConstInt(getContext()).getExtValue();
  };
  if (FlatWGS) {
    Min = Eval(FlatWGS->getMin());
    Max = Eval(FlatWGS->getMax());
  }
  if (ReqdWGS && Min == 0 && Max == 0)
    Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
                Eval(ReqdWGS->getZDim());
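  // For example, reqd_work_group_size(8, 8, 4) yields
  // Min = Max = 8 * 8 * 4 = 256, emitted below as
  // "amdgpu-flat-work-group-size"="256,256".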

  if (Min != 0) {
    assert(Min <= Max && "Min must be less than or equal to Max");

    if (MinThreadsVal)
      *MinThreadsVal = Min;
    if (MaxThreadsVal)
      *MaxThreadsVal = Max;
    std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
    if (F)
      F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

void CodeGenModule::handleAMDGPUWavesPerEUAttr(
    llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
  unsigned Min =
      Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
  unsigned Max =
      Attr->getMax()
          ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
          : 0;

  if (Min != 0) {
    assert((Max == 0 || Min <= Max) && "Min must be less than or equal to Max");

    std::string AttrVal = llvm::utostr(Min);
    if (Max != 0)
      AttrVal = AttrVal + "," + llvm::utostr(Max);
    F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
  } else
    assert(Max == 0 && "Max must be zero");
}

std::unique_ptr<TargetCodeGenInfo>
CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
}