xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMDGPU HSA Metadata Streamer.
11 ///
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUHSAMetadataStreamer.h"
16 #include "AMDGPU.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIProgramInfo.h"
21 #include "llvm/IR/Module.h"
22 #include "llvm/MC/MCContext.h"
23 #include "llvm/MC/MCExpr.h"
24 #include "llvm/Target/TargetLoweringObjectFile.h"
25 
26 using namespace llvm;
27 
28 static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
29                                                      const DataLayout &DL) {
30   Type *Ty = Arg.getType();
31   MaybeAlign ArgAlign;
32   if (Arg.hasByRefAttr()) {
33     Ty = Arg.getParamByRefType();
34     ArgAlign = Arg.getParamAlign();
35   }
36 
37   if (!ArgAlign)
38     ArgAlign = DL.getABITypeAlign(Ty);
39 
40   return std::pair(Ty, *ArgAlign);
41 }
42 
43 /// Find the mangled symbol name for the runtime handle for \p EnqueuedBlock
44 static std::string getEnqueuedBlockSymbolName(const AMDGPUTargetMachine &TM,
45                                               const Function &EnqueuedBlock) {
46   const MDNode *Associated =
47       EnqueuedBlock.getMetadata(LLVMContext::MD_associated);
48   if (!Associated)
49     return "";
50 
51   auto *VM = cast<ValueAsMetadata>(Associated->getOperand(0));
52   auto *RuntimeHandle =
53       dyn_cast<GlobalVariable>(VM->getValue()->stripPointerCasts());
54   if (!RuntimeHandle ||
55       RuntimeHandle->getSection() != ".amdgpu.kernel.runtime.handle")
56     return "";
57 
58   SmallString<128> Name;
59   TM.getNameWithPrefix(Name, RuntimeHandle,
60                        TM.getObjFileLowering()->getMangler());
61   return Name.str().str();
62 }
63 
64 namespace llvm {
65 
66 static cl::opt<bool> DumpHSAMetadata(
67     "amdgpu-dump-hsa-metadata",
68     cl::desc("Dump AMDGPU HSA Metadata"));
69 static cl::opt<bool> VerifyHSAMetadata(
70     "amdgpu-verify-hsa-metadata",
71     cl::desc("Verify AMDGPU HSA Metadata"));
72 
73 namespace AMDGPU::HSAMD {
74 
75 //===----------------------------------------------------------------------===//
76 // HSAMetadataStreamerV4
77 //===----------------------------------------------------------------------===//
78 
79 void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const {
80   errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
81 }
82 
83 void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
84   errs() << "AMDGPU HSA Metadata Parser Test: ";
85 
86   msgpack::Document FromHSAMetadataString;
87 
88   if (!FromHSAMetadataString.fromYAML(HSAMetadataString)) {
89     errs() << "FAIL\n";
90     return;
91   }
92 
93   std::string ToHSAMetadataString;
94   raw_string_ostream StrOS(ToHSAMetadataString);
95   FromHSAMetadataString.toYAML(StrOS);
96 
97   errs() << (HSAMetadataString == StrOS.str() ? "PASS" : "FAIL") << '\n';
98   if (HSAMetadataString != ToHSAMetadataString) {
99     errs() << "Original input: " << HSAMetadataString << '\n'
100            << "Produced output: " << StrOS.str() << '\n';
101   }
102 }
103 
104 std::optional<StringRef>
105 MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const {
106   return StringSwitch<std::optional<StringRef>>(AccQual)
107       .Case("read_only", StringRef("read_only"))
108       .Case("write_only", StringRef("write_only"))
109       .Case("read_write", StringRef("read_write"))
110       .Default(std::nullopt);
111 }
112 
113 std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier(
114     unsigned AddressSpace) const {
115   switch (AddressSpace) {
116   case AMDGPUAS::PRIVATE_ADDRESS:
117     return StringRef("private");
118   case AMDGPUAS::GLOBAL_ADDRESS:
119     return StringRef("global");
120   case AMDGPUAS::CONSTANT_ADDRESS:
121     return StringRef("constant");
122   case AMDGPUAS::LOCAL_ADDRESS:
123     return StringRef("local");
124   case AMDGPUAS::FLAT_ADDRESS:
125     return StringRef("generic");
126   case AMDGPUAS::REGION_ADDRESS:
127     return StringRef("region");
128   default:
129     return std::nullopt;
130   }
131 }
132 
133 StringRef
134 MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual,
135                                         StringRef BaseTypeName) const {
136   if (TypeQual.contains("pipe"))
137     return "pipe";
138 
139   return StringSwitch<StringRef>(BaseTypeName)
140       .Case("image1d_t", "image")
141       .Case("image1d_array_t", "image")
142       .Case("image1d_buffer_t", "image")
143       .Case("image2d_t", "image")
144       .Case("image2d_array_t", "image")
145       .Case("image2d_array_depth_t", "image")
146       .Case("image2d_array_msaa_t", "image")
147       .Case("image2d_array_msaa_depth_t", "image")
148       .Case("image2d_depth_t", "image")
149       .Case("image2d_msaa_t", "image")
150       .Case("image2d_msaa_depth_t", "image")
151       .Case("image3d_t", "image")
152       .Case("sampler_t", "sampler")
153       .Case("queue_t", "queue")
154       .Default(isa<PointerType>(Ty)
155                    ? (Ty->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
156                           ? "dynamic_shared_pointer"
157                           : "global_buffer")
158                    : "by_value");
159 }
160 
161 std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty,
162                                                    bool Signed) const {
163   switch (Ty->getTypeID()) {
164   case Type::IntegerTyID: {
165     if (!Signed)
166       return (Twine('u') + getTypeName(Ty, true)).str();
167 
168     auto BitWidth = Ty->getIntegerBitWidth();
169     switch (BitWidth) {
170     case 8:
171       return "char";
172     case 16:
173       return "short";
174     case 32:
175       return "int";
176     case 64:
177       return "long";
178     default:
179       return (Twine('i') + Twine(BitWidth)).str();
180     }
181   }
182   case Type::HalfTyID:
183     return "half";
184   case Type::FloatTyID:
185     return "float";
186   case Type::DoubleTyID:
187     return "double";
188   case Type::FixedVectorTyID: {
189     auto *VecTy = cast<FixedVectorType>(Ty);
190     auto *ElTy = VecTy->getElementType();
191     auto NumElements = VecTy->getNumElements();
192     return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
193   }
194   default:
195     return "unknown";
196   }
197 }
198 
199 msgpack::ArrayDocNode
200 MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
201   auto Dims = HSAMetadataDoc->getArrayNode();
202   if (Node->getNumOperands() != 3)
203     return Dims;
204 
205   for (auto &Op : Node->operands())
206     Dims.push_back(Dims.getDocument()->getNode(
207         uint64_t(mdconst::extract<ConstantInt>(Op)->getZExtValue())));
208   return Dims;
209 }
210 
211 void MetadataStreamerMsgPackV4::emitVersion() {
212   auto Version = HSAMetadataDoc->getArrayNode();
213   Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
214   Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
215   getRootMetadata("amdhsa.version") = Version;
216 }
217 
218 void MetadataStreamerMsgPackV4::emitTargetID(
219     const IsaInfo::AMDGPUTargetID &TargetID) {
220   getRootMetadata("amdhsa.target") =
221       HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
222 }
223 
224 void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) {
225   auto *Node = Mod.getNamedMetadata("llvm.printf.fmts");
226   if (!Node)
227     return;
228 
229   auto Printf = HSAMetadataDoc->getArrayNode();
230   for (auto *Op : Node->operands())
231     if (Op->getNumOperands())
232       Printf.push_back(Printf.getDocument()->getNode(
233           cast<MDString>(Op->getOperand(0))->getString(), /*Copy=*/true));
234   getRootMetadata("amdhsa.printf") = Printf;
235 }
236 
237 void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
238                                                    msgpack::MapDocNode Kern) {
239   // TODO: What about other languages?
240   auto *Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
241   if (!Node || !Node->getNumOperands())
242     return;
243   auto *Op0 = Node->getOperand(0);
244   if (Op0->getNumOperands() <= 1)
245     return;
246 
247   Kern[".language"] = Kern.getDocument()->getNode("OpenCL C");
248   auto LanguageVersion = Kern.getDocument()->getArrayNode();
249   LanguageVersion.push_back(Kern.getDocument()->getNode(
250       mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
251   LanguageVersion.push_back(Kern.getDocument()->getNode(
252       mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
253   Kern[".language_version"] = LanguageVersion;
254 }
255 
256 void MetadataStreamerMsgPackV4::emitKernelAttrs(const AMDGPUTargetMachine &TM,
257                                                 const Function &Func,
258                                                 msgpack::MapDocNode Kern) {
259 
260   if (auto *Node = Func.getMetadata("reqd_work_group_size"))
261     Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
262   if (auto *Node = Func.getMetadata("work_group_size_hint"))
263     Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
264   if (auto *Node = Func.getMetadata("vec_type_hint")) {
265     Kern[".vec_type_hint"] = Kern.getDocument()->getNode(
266         getTypeName(
267             cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
268             mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
269         /*Copy=*/true);
270   }
271 
272   std::string HandleName = getEnqueuedBlockSymbolName(TM, Func);
273   if (!HandleName.empty()) {
274     Kern[".device_enqueue_symbol"] =
275         Kern.getDocument()->getNode(std::move(HandleName), /*Copy=*/true);
276   }
277 
278   if (Func.hasFnAttribute("device-init"))
279     Kern[".kind"] = Kern.getDocument()->getNode("init");
280   else if (Func.hasFnAttribute("device-fini"))
281     Kern[".kind"] = Kern.getDocument()->getNode("fini");
282 }
283 
284 void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
285                                                msgpack::MapDocNode Kern) {
286   auto &Func = MF.getFunction();
287   unsigned Offset = 0;
288   auto Args = HSAMetadataDoc->getArrayNode();
289   for (auto &Arg : Func.args()) {
290     if (Arg.hasAttribute("amdgpu-hidden-argument"))
291       continue;
292 
293     emitKernelArg(Arg, Offset, Args);
294   }
295 
296   emitHiddenKernelArgs(MF, Offset, Args);
297 
298   Kern[".args"] = Args;
299 }
300 
301 void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
302                                               unsigned &Offset,
303                                               msgpack::ArrayDocNode Args) {
304   const auto *Func = Arg.getParent();
305   auto ArgNo = Arg.getArgNo();
306   const MDNode *Node;
307 
308   StringRef Name;
309   Node = Func->getMetadata("kernel_arg_name");
310   if (Node && ArgNo < Node->getNumOperands())
311     Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
312   else if (Arg.hasName())
313     Name = Arg.getName();
314 
315   StringRef TypeName;
316   Node = Func->getMetadata("kernel_arg_type");
317   if (Node && ArgNo < Node->getNumOperands())
318     TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
319 
320   StringRef BaseTypeName;
321   Node = Func->getMetadata("kernel_arg_base_type");
322   if (Node && ArgNo < Node->getNumOperands())
323     BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
324 
325   StringRef ActAccQual;
326   // Do we really need NoAlias check here?
327   if (Arg.getType()->isPointerTy() && Arg.hasNoAliasAttr()) {
328     if (Arg.onlyReadsMemory())
329       ActAccQual = "read_only";
330     else if (Arg.hasAttribute(Attribute::WriteOnly))
331       ActAccQual = "write_only";
332   }
333 
334   StringRef AccQual;
335   Node = Func->getMetadata("kernel_arg_access_qual");
336   if (Node && ArgNo < Node->getNumOperands())
337     AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
338 
339   StringRef TypeQual;
340   Node = Func->getMetadata("kernel_arg_type_qual");
341   if (Node && ArgNo < Node->getNumOperands())
342     TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
343 
344   const DataLayout &DL = Func->getDataLayout();
345 
346   MaybeAlign PointeeAlign;
347   Type *Ty = Arg.hasByRefAttr() ? Arg.getParamByRefType() : Arg.getType();
348 
349   // FIXME: Need to distinguish in memory alignment from pointer alignment.
350   if (auto *PtrTy = dyn_cast<PointerType>(Ty)) {
351     if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
352       PointeeAlign = Arg.getParamAlign().valueOrOne();
353   }
354 
355   // There's no distinction between byval aggregates and raw aggregates.
356   Type *ArgTy;
357   Align ArgAlign;
358   std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
359 
360   emitKernelArg(DL, ArgTy, ArgAlign,
361                 getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
362                 PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual,
363                 AccQual, TypeQual);
364 }
365 
366 void MetadataStreamerMsgPackV4::emitKernelArg(
367     const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
368     unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
369     StringRef Name, StringRef TypeName, StringRef BaseTypeName,
370     StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) {
371   auto Arg = Args.getDocument()->getMapNode();
372 
373   if (!Name.empty())
374     Arg[".name"] = Arg.getDocument()->getNode(Name, /*Copy=*/true);
375   if (!TypeName.empty())
376     Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
377   auto Size = DL.getTypeAllocSize(Ty);
378   Arg[".size"] = Arg.getDocument()->getNode(Size);
379   Offset = alignTo(Offset, Alignment);
380   Arg[".offset"] = Arg.getDocument()->getNode(Offset);
381   Offset += Size;
382   Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true);
383   if (PointeeAlign)
384     Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign->value());
385 
386   if (auto *PtrTy = dyn_cast<PointerType>(Ty))
387     if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
388       // Limiting address space to emit only for a certain ValueKind.
389       if (ValueKind == "global_buffer" || ValueKind == "dynamic_shared_pointer")
390         Arg[".address_space"] = Arg.getDocument()->getNode(*Qualifier,
391                                                            /*Copy=*/true);
392 
393   if (auto AQ = getAccessQualifier(AccQual))
394     Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true);
395 
396   if (auto AAQ = getAccessQualifier(ActAccQual))
397     Arg[".actual_access"] = Arg.getDocument()->getNode(*AAQ, /*Copy=*/true);
398 
399   SmallVector<StringRef, 1> SplitTypeQuals;
400   TypeQual.split(SplitTypeQuals, " ", -1, false);
401   for (StringRef Key : SplitTypeQuals) {
402     if (Key == "const")
403       Arg[".is_const"] = Arg.getDocument()->getNode(true);
404     else if (Key == "restrict")
405       Arg[".is_restrict"] = Arg.getDocument()->getNode(true);
406     else if (Key == "volatile")
407       Arg[".is_volatile"] = Arg.getDocument()->getNode(true);
408     else if (Key == "pipe")
409       Arg[".is_pipe"] = Arg.getDocument()->getNode(true);
410   }
411 
412   Args.push_back(Arg);
413 }
414 
415 void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
416     const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) {
417   auto &Func = MF.getFunction();
418   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
419 
420   unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
421   if (!HiddenArgNumBytes)
422     return;
423 
424   const Module *M = Func.getParent();
425   auto &DL = M->getDataLayout();
426   auto *Int64Ty = Type::getInt64Ty(Func.getContext());
427 
428   Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
429 
430   if (HiddenArgNumBytes >= 8)
431     emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset,
432                   Args);
433   if (HiddenArgNumBytes >= 16)
434     emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset,
435                   Args);
436   if (HiddenArgNumBytes >= 24)
437     emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset,
438                   Args);
439 
440   auto *Int8PtrTy =
441       PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
442 
443   if (HiddenArgNumBytes >= 32) {
444     // We forbid the use of features requiring hostcall when compiling OpenCL
445     // before code object V5, which makes the mutual exclusion between the
446     // "printf buffer" and "hostcall buffer" here sound.
447     if (M->getNamedMetadata("llvm.printf.fmts"))
448       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
449                     Args);
450     else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
451       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
452                     Args);
453     else
454       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
455   }
456 
457   // Emit "default queue" and "completion action" arguments if enqueue kernel is
458   // used, otherwise emit dummy "none" arguments.
459   if (HiddenArgNumBytes >= 40) {
460     if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
461       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
462                     Args);
463     } else {
464       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
465     }
466   }
467 
468   if (HiddenArgNumBytes >= 48) {
469     if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
470       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
471                     Args);
472     } else {
473       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
474     }
475   }
476 
477   // Emit the pointer argument for multi-grid object.
478   if (HiddenArgNumBytes >= 56) {
479     if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
480       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
481                     Args);
482     } else {
483       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
484     }
485   }
486 }
487 
488 msgpack::MapDocNode
489 MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
490                                              const SIProgramInfo &ProgramInfo,
491                                              unsigned CodeObjectVersion) const {
492   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
493   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
494   const Function &F = MF.getFunction();
495 
496   auto Kern = HSAMetadataDoc->getMapNode();
497 
498   Align MaxKernArgAlign;
499   Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode(
500       STM.getKernArgSegmentSize(F, MaxKernArgAlign));
501   Kern[".group_segment_fixed_size"] =
502       Kern.getDocument()->getNode(ProgramInfo.LDSSize);
503   DelayedExprs->assignDocNode(Kern[".private_segment_fixed_size"],
504                               msgpack::Type::UInt, ProgramInfo.ScratchSize);
505   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) {
506     DelayedExprs->assignDocNode(Kern[".uses_dynamic_stack"],
507                                 msgpack::Type::Boolean,
508                                 ProgramInfo.DynamicCallStack);
509   }
510 
511   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
512     Kern[".workgroup_processor_mode"] =
513         Kern.getDocument()->getNode(ProgramInfo.WgpMode);
514 
515   // FIXME: The metadata treats the minimum as 16?
516   Kern[".kernarg_segment_align"] =
517       Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
518   Kern[".wavefront_size"] =
519       Kern.getDocument()->getNode(STM.getWavefrontSize());
520   DelayedExprs->assignDocNode(Kern[".sgpr_count"], msgpack::Type::UInt,
521                               ProgramInfo.NumSGPR);
522   DelayedExprs->assignDocNode(Kern[".vgpr_count"], msgpack::Type::UInt,
523                               ProgramInfo.NumVGPR);
524 
525   // Only add AGPR count to metadata for supported devices
526   if (STM.hasMAIInsts()) {
527     DelayedExprs->assignDocNode(Kern[".agpr_count"], msgpack::Type::UInt,
528                                 ProgramInfo.NumAccVGPR);
529   }
530 
531   Kern[".max_flat_workgroup_size"] =
532       Kern.getDocument()->getNode(MFI.getMaxFlatWorkGroupSize());
533 
534   uint32_t NumWGY = MFI.getMaxNumWorkGroupsY();
535   uint32_t NumWGZ = MFI.getMaxNumWorkGroupsZ();
536   uint32_t NumWGX = MFI.getMaxNumWorkGroupsX();
537 
538   // TODO: Should consider 0 invalid and reject in IR verifier.
539   if (NumWGX != std::numeric_limits<uint32_t>::max() && NumWGX != 0)
540     Kern[".max_num_workgroups_x"] = Kern.getDocument()->getNode(NumWGX);
541 
542   if (NumWGY != std::numeric_limits<uint32_t>::max() && NumWGY != 0)
543     Kern[".max_num_workgroups_y"] = Kern.getDocument()->getNode(NumWGY);
544 
545   if (NumWGZ != std::numeric_limits<uint32_t>::max() && NumWGZ != 0)
546     Kern[".max_num_workgroups_z"] = Kern.getDocument()->getNode(NumWGZ);
547 
548   Kern[".sgpr_spill_count"] =
549       Kern.getDocument()->getNode(MFI.getNumSpilledSGPRs());
550   Kern[".vgpr_spill_count"] =
551       Kern.getDocument()->getNode(MFI.getNumSpilledVGPRs());
552 
553   return Kern;
554 }
555 
556 bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
557   DelayedExprs->resolveDelayedExpressions();
558   return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
559 }
560 
561 void MetadataStreamerMsgPackV4::begin(const Module &Mod,
562                                       const IsaInfo::AMDGPUTargetID &TargetID) {
563   emitVersion();
564   emitTargetID(TargetID);
565   emitPrintf(Mod);
566   getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
567   DelayedExprs->clear();
568 }
569 
570 void MetadataStreamerMsgPackV4::end() {
571   DelayedExprs->resolveDelayedExpressions();
572   std::string HSAMetadataString;
573   raw_string_ostream StrOS(HSAMetadataString);
574   HSAMetadataDoc->toYAML(StrOS);
575 
576   if (DumpHSAMetadata)
577     dump(StrOS.str());
578   if (VerifyHSAMetadata)
579     verify(StrOS.str());
580 }
581 
582 void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
583                                            const SIProgramInfo &ProgramInfo) {
584   auto &Func = MF.getFunction();
585   if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
586       Func.getCallingConv() != CallingConv::SPIR_KERNEL)
587     return;
588 
589   auto CodeObjectVersion =
590       AMDGPU::getAMDHSACodeObjectVersion(*Func.getParent());
591   auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion);
592 
593   auto Kernels =
594       getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true);
595 
596   auto &TM = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
597   {
598     Kern[".name"] = Kern.getDocument()->getNode(Func.getName());
599     Kern[".symbol"] = Kern.getDocument()->getNode(
600         (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
601     emitKernelLanguage(Func, Kern);
602     emitKernelAttrs(TM, Func, Kern);
603     emitKernelArgs(MF, Kern);
604   }
605 
606   Kernels.push_back(Kern);
607 }
608 
609 //===----------------------------------------------------------------------===//
610 // HSAMetadataStreamerV5
611 //===----------------------------------------------------------------------===//
612 
613 void MetadataStreamerMsgPackV5::emitVersion() {
614   auto Version = HSAMetadataDoc->getArrayNode();
615   Version.push_back(Version.getDocument()->getNode(VersionMajorV5));
616   Version.push_back(Version.getDocument()->getNode(VersionMinorV5));
617   getRootMetadata("amdhsa.version") = Version;
618 }
619 
620 void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
621     const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) {
622   auto &Func = MF.getFunction();
623   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
624 
625   // No implicit kernel argument is used.
626   if (ST.getImplicitArgNumBytes(Func) == 0)
627     return;
628 
629   const Module *M = Func.getParent();
630   auto &DL = M->getDataLayout();
631   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
632 
633   auto *Int64Ty = Type::getInt64Ty(Func.getContext());
634   auto *Int32Ty = Type::getInt32Ty(Func.getContext());
635   auto *Int16Ty = Type::getInt16Ty(Func.getContext());
636 
637   Offset = alignTo(Offset, ST.getAlignmentForImplicitArgPtr());
638   emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_x", Offset, Args);
639   emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_y", Offset, Args);
640   emitKernelArg(DL, Int32Ty, Align(4), "hidden_block_count_z", Offset, Args);
641 
642   emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_x", Offset, Args);
643   emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_y", Offset, Args);
644   emitKernelArg(DL, Int16Ty, Align(2), "hidden_group_size_z", Offset, Args);
645 
646   emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_x", Offset, Args);
647   emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_y", Offset, Args);
648   emitKernelArg(DL, Int16Ty, Align(2), "hidden_remainder_z", Offset, Args);
649 
650   // Reserved for hidden_tool_correlation_id.
651   Offset += 8;
652 
653   Offset += 8; // Reserved.
654 
655   emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_x", Offset, Args);
656   emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_y", Offset, Args);
657   emitKernelArg(DL, Int64Ty, Align(8), "hidden_global_offset_z", Offset, Args);
658 
659   emitKernelArg(DL, Int16Ty, Align(2), "hidden_grid_dims", Offset, Args);
660 
661   Offset += 6; // Reserved.
662   auto *Int8PtrTy =
663       PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
664 
665   if (M->getNamedMetadata("llvm.printf.fmts")) {
666     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
667                   Args);
668   } else {
669     Offset += 8; // Skipped.
670   }
671 
672   if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
673     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
674                   Args);
675   } else {
676     Offset += 8; // Skipped.
677   }
678 
679   if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg")) {
680     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_multigrid_sync_arg", Offset,
681                 Args);
682   } else {
683     Offset += 8; // Skipped.
684   }
685 
686   if (!Func.hasFnAttribute("amdgpu-no-heap-ptr"))
687     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_heap_v1", Offset, Args);
688   else
689     Offset += 8; // Skipped.
690 
691   if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
692     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_default_queue", Offset,
693                   Args);
694   } else {
695     Offset += 8; // Skipped.
696   }
697 
698   if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
699     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
700                   Args);
701   } else {
702     Offset += 8; // Skipped.
703   }
704 
705   // Emit argument for hidden dynamic lds size
706   if (MFI.isDynamicLDSUsed()) {
707     emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset,
708                   Args);
709   } else {
710     Offset += 4; // skipped
711   }
712 
713   Offset += 68; // Reserved.
714 
715   // hidden_private_base and hidden_shared_base are only when the subtarget has
716   // ApertureRegs.
717   if (!ST.hasApertureRegs()) {
718     emitKernelArg(DL, Int32Ty, Align(4), "hidden_private_base", Offset, Args);
719     emitKernelArg(DL, Int32Ty, Align(4), "hidden_shared_base", Offset, Args);
720   } else {
721     Offset += 8; // Skipped.
722   }
723 
724   if (MFI.getUserSGPRInfo().hasQueuePtr())
725     emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
726 }
727 
728 void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
729                                                 const Function &Func,
730                                                 msgpack::MapDocNode Kern) {
731   MetadataStreamerMsgPackV4::emitKernelAttrs(TM, Func, Kern);
732 
733   if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
734     Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);
735 }
736 
737 //===----------------------------------------------------------------------===//
738 // HSAMetadataStreamerV6
739 //===----------------------------------------------------------------------===//
740 
741 void MetadataStreamerMsgPackV6::emitVersion() {
742   auto Version = HSAMetadataDoc->getArrayNode();
743   Version.push_back(Version.getDocument()->getNode(VersionMajorV6));
744   Version.push_back(Version.getDocument()->getNode(VersionMinorV6));
745   getRootMetadata("amdhsa.version") = Version;
746 }
747 
748 } // end namespace AMDGPU::HSAMD
749 } // end namespace llvm
750