1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// 11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary 12 /// code. When passed an MCAsmStreamer it prints assembly and when passed 13 /// an MCObjectStreamer it outputs binary code. 14 // 15 //===----------------------------------------------------------------------===// 16 // 17 18 #include "AMDGPUAsmPrinter.h" 19 #include "AMDGPU.h" 20 #include "AMDGPUHSAMetadataStreamer.h" 21 #include "AMDGPUResourceUsageAnalysis.h" 22 #include "AMDKernelCodeT.h" 23 #include "GCNSubtarget.h" 24 #include "MCTargetDesc/AMDGPUInstPrinter.h" 25 #include "MCTargetDesc/AMDGPUTargetStreamer.h" 26 #include "R600AsmPrinter.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "TargetInfo/AMDGPUTargetInfo.h" 29 #include "Utils/AMDGPUBaseInfo.h" 30 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 31 #include "llvm/BinaryFormat/ELF.h" 32 #include "llvm/CodeGen/MachineFrameInfo.h" 33 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/MC/MCAssembler.h" 36 #include "llvm/MC/MCContext.h" 37 #include "llvm/MC/MCSectionELF.h" 38 #include "llvm/MC/MCStreamer.h" 39 #include "llvm/MC/TargetRegistry.h" 40 #include "llvm/Support/AMDHSAKernelDescriptor.h" 41 #include "llvm/Target/TargetLoweringObjectFile.h" 42 #include "llvm/Target/TargetMachine.h" 43 #include "llvm/TargetParser/TargetParser.h" 44 45 using namespace llvm; 46 using namespace llvm::AMDGPU; 47 48 // This should get the default rounding mode from the kernel. 
We just set the 49 // default here, but this could change if the OpenCL rounding mode pragmas are 50 // used. 51 // 52 // The denormal mode here should match what is reported by the OpenCL runtime 53 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but 54 // can also be overridden to flush with the -cl-denorms-are-zero compiler flag. 55 // 56 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double 57 // precision, and leaves single precision to flush all and does not report 58 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports 59 // CL_FP_DENORM for both. 60 // 61 // FIXME: It seems some instructions do not support single precision denormals 62 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f64, sqrt_f32, 63 // and sin_f32, cos_f32 on most parts). 64 65 // We want to use these instructions, and using fp32 denormals also causes 66 // instructions to run at the double precision rate for the device so it's 67 // probably best to just report no single precision denormals.
68 static uint32_t getFPMode(SIModeRegisterDefaults Mode) { 69 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | 70 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | 71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | 72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); 73 } 74 75 static AsmPrinter * 76 createAMDGPUAsmPrinterPass(TargetMachine &tm, 77 std::unique_ptr<MCStreamer> &&Streamer) { 78 return new AMDGPUAsmPrinter(tm, std::move(Streamer)); 79 } 80 81 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { 82 TargetRegistry::RegisterAsmPrinter(getTheR600Target(), 83 llvm::createR600AsmPrinterPass); 84 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), 85 createAMDGPUAsmPrinterPass); 86 } 87 88 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, 89 std::unique_ptr<MCStreamer> Streamer) 90 : AsmPrinter(TM, std::move(Streamer)) { 91 assert(OutStreamer && "AsmPrinter constructed without streamer"); 92 } 93 94 StringRef AMDGPUAsmPrinter::getPassName() const { 95 return "AMDGPU Assembly Printer"; 96 } 97 98 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { 99 return TM.getMCSubtargetInfo(); 100 } 101 102 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { 103 if (!OutStreamer) 104 return nullptr; 105 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); 106 } 107 108 void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { 109 IsTargetStreamerInitialized = false; 110 } 111 112 void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { 113 IsTargetStreamerInitialized = true; 114 115 // TODO: Which one is called first, emitStartOfAsmFile or 116 // emitFunctionBodyStart? 
117 if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) 118 initializeTargetID(M); 119 120 if (TM.getTargetTriple().getOS() != Triple::AMDHSA && 121 TM.getTargetTriple().getOS() != Triple::AMDPAL) 122 return; 123 124 if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3) 125 getTargetStreamer()->EmitDirectiveAMDGCNTarget(); 126 127 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) 128 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); 129 130 if (TM.getTargetTriple().getOS() == Triple::AMDPAL) 131 getTargetStreamer()->getPALMetadata()->readFromIR(M); 132 133 if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3) 134 return; 135 136 // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2. 137 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) 138 getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); 139 140 // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2. 141 IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); 142 getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2( 143 Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); 144 } 145 146 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { 147 // Init target streamer if it has not yet happened 148 if (!IsTargetStreamerInitialized) 149 initTargetStreamer(M); 150 151 if (TM.getTargetTriple().getOS() != Triple::AMDHSA || 152 CodeObjectVersion == AMDGPU::AMDHSA_COV2) 153 getTargetStreamer()->EmitISAVersion(); 154 155 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). 156 // Emit HSA Metadata (NT_AMD_HSA_METADATA). 
157 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 158 HSAMetadataStream->end(); 159 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); 160 (void)Success; 161 assert(Success && "Malformed HSA Metadata"); 162 } 163 } 164 165 bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( 166 const MachineBasicBlock *MBB) const { 167 if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) 168 return false; 169 170 if (MBB->empty()) 171 return true; 172 173 // If this is a block implementing a long branch, an expression relative to 174 // the start of the block is needed. to the start of the block. 175 // XXX - Is there a smarter way to check this? 176 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); 177 } 178 179 void AMDGPUAsmPrinter::emitFunctionBodyStart() { 180 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 181 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 182 const Function &F = MF->getFunction(); 183 184 // TODO: Which one is called first, emitStartOfAsmFile or 185 // emitFunctionBodyStart? 186 if (!getTargetStreamer()->getTargetID()) 187 initializeTargetID(*F.getParent()); 188 189 const auto &FunctionTargetID = STM.getTargetID(); 190 // Make sure function's xnack settings are compatible with module's 191 // xnack settings. 192 if (FunctionTargetID.isXnackSupported() && 193 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && 194 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { 195 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) + 196 "' function does not match module xnack setting"); 197 return; 198 } 199 // Make sure function's sramecc settings are compatible with module's 200 // sramecc settings. 
201 if (FunctionTargetID.isSramEccSupported() && 202 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && 203 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { 204 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) + 205 "' function does not match module sramecc setting"); 206 return; 207 } 208 209 if (!MFI.isEntryFunction()) 210 return; 211 212 if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) && 213 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 214 F.getCallingConv() == CallingConv::SPIR_KERNEL)) { 215 amd_kernel_code_t KernelCode; 216 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); 217 getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); 218 } 219 220 if (STM.isAmdHsaOS()) 221 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); 222 } 223 224 void AMDGPUAsmPrinter::emitFunctionBodyEnd() { 225 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 226 if (!MFI.isEntryFunction()) 227 return; 228 229 if (TM.getTargetTriple().getOS() != Triple::AMDHSA || 230 CodeObjectVersion == AMDGPU::AMDHSA_COV2) 231 return; 232 233 auto &Streamer = getTargetStreamer()->getStreamer(); 234 auto &Context = Streamer.getContext(); 235 auto &ObjectFileInfo = *Context.getObjectFileInfo(); 236 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); 237 238 Streamer.pushSection(); 239 Streamer.switchSection(&ReadOnlySection); 240 241 // CP microcode requires the kernel descriptor to be allocated on 64 byte 242 // alignment. 
243 Streamer.emitValueToAlignment(Align(64), 0, 1, 0); 244 ReadOnlySection.ensureMinAlignment(Align(64)); 245 246 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 247 248 SmallString<128> KernelName; 249 getNameWithPrefix(KernelName, &MF->getFunction()); 250 getTargetStreamer()->EmitAmdhsaKernelDescriptor( 251 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), 252 CurrentProgramInfo.NumVGPRsForWavesPerEU, 253 CurrentProgramInfo.NumSGPRsForWavesPerEU - 254 IsaInfo::getNumExtraSGPRs( 255 &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, 256 getTargetStreamer()->getTargetID()->isXnackOnOrAny()), 257 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, 258 CodeObjectVersion); 259 260 Streamer.popSection(); 261 } 262 263 void AMDGPUAsmPrinter::emitFunctionEntryLabel() { 264 if (TM.getTargetTriple().getOS() == Triple::AMDHSA && 265 CodeObjectVersion >= AMDGPU::AMDHSA_COV3) { 266 AsmPrinter::emitFunctionEntryLabel(); 267 return; 268 } 269 270 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 271 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 272 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) { 273 SmallString<128> SymbolName; 274 getNameWithPrefix(SymbolName, &MF->getFunction()), 275 getTargetStreamer()->EmitAMDGPUSymbolType( 276 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); 277 } 278 if (DumpCodeInstEmitter) { 279 // Disassemble function name label to text. 280 DisasmLines.push_back(MF->getName().str() + ":"); 281 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 282 HexLines.push_back(""); 283 } 284 285 AsmPrinter::emitFunctionEntryLabel(); 286 } 287 288 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { 289 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { 290 // Write a line for the basic block label if it is not only fallthrough. 
291 DisasmLines.push_back( 292 (Twine("BB") + Twine(getFunctionNumber()) 293 + "_" + Twine(MBB.getNumber()) + ":").str()); 294 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 295 HexLines.push_back(""); 296 } 297 AsmPrinter::emitBasicBlockStart(MBB); 298 } 299 300 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { 301 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 302 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { 303 OutContext.reportError({}, 304 Twine(GV->getName()) + 305 ": unsupported initializer for address space"); 306 return; 307 } 308 309 // LDS variables aren't emitted in HSA or PAL yet. 310 const Triple::OSType OS = TM.getTargetTriple().getOS(); 311 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 312 return; 313 314 MCSymbol *GVSym = getSymbol(GV); 315 316 GVSym->redefineIfPossible(); 317 if (GVSym->isDefined() || GVSym->isVariable()) 318 report_fatal_error("symbol '" + Twine(GVSym->getName()) + 319 "' is already defined"); 320 321 const DataLayout &DL = GV->getParent()->getDataLayout(); 322 uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); 323 Align Alignment = GV->getAlign().value_or(Align(4)); 324 325 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); 326 emitLinkage(GV, GVSym); 327 auto TS = getTargetStreamer(); 328 TS->emitAMDGPULDS(GVSym, Size, Alignment); 329 return; 330 } 331 332 AsmPrinter::emitGlobalVariable(GV); 333 } 334 335 bool AMDGPUAsmPrinter::doInitialization(Module &M) { 336 CodeObjectVersion = AMDGPU::getCodeObjectVersion(M); 337 338 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 339 switch (CodeObjectVersion) { 340 case AMDGPU::AMDHSA_COV2: 341 HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2()); 342 break; 343 case AMDGPU::AMDHSA_COV3: 344 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3()); 345 break; 346 case AMDGPU::AMDHSA_COV4: 347 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); 348 
break; 349 case AMDGPU::AMDHSA_COV5: 350 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5()); 351 break; 352 default: 353 report_fatal_error("Unexpected code object version"); 354 } 355 } 356 return AsmPrinter::doInitialization(M); 357 } 358 359 bool AMDGPUAsmPrinter::doFinalization(Module &M) { 360 // Pad with s_code_end to help tools and guard against instruction prefetch 361 // causing stale data in caches. Arguably this should be done by the linker, 362 // which is why this isn't done for Mesa. 363 const MCSubtargetInfo &STI = *getGlobalSTI(); 364 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && 365 (STI.getTargetTriple().getOS() == Triple::AMDHSA || 366 STI.getTargetTriple().getOS() == Triple::AMDPAL)) { 367 OutStreamer->switchSection(getObjFileLowering().getTextSection()); 368 getTargetStreamer()->EmitCodeEnd(STI); 369 } 370 371 return AsmPrinter::doFinalization(M); 372 } 373 374 // Print comments that apply to both callable functions and entry points. 375 void AMDGPUAsmPrinter::emitCommonFunctionComments( 376 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, 377 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, 378 const AMDGPUMachineFunction *MFI) { 379 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); 380 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); 381 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); 382 if (NumAGPR) { 383 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); 384 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), 385 false); 386 } 387 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); 388 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), 389 false); 390 } 391 392 uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( 393 const MachineFunction &MF) const { 394 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 
395 uint16_t KernelCodeProperties = 0; 396 397 if (MFI.hasPrivateSegmentBuffer()) { 398 KernelCodeProperties |= 399 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 400 } 401 if (MFI.hasDispatchPtr()) { 402 KernelCodeProperties |= 403 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 404 } 405 if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { 406 KernelCodeProperties |= 407 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 408 } 409 if (MFI.hasKernargSegmentPtr()) { 410 KernelCodeProperties |= 411 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 412 } 413 if (MFI.hasDispatchID()) { 414 KernelCodeProperties |= 415 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 416 } 417 if (MFI.hasFlatScratchInit()) { 418 KernelCodeProperties |= 419 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 420 } 421 if (MF.getSubtarget<GCNSubtarget>().isWave32()) { 422 KernelCodeProperties |= 423 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; 424 } 425 426 if (CurrentProgramInfo.DynamicCallStack && 427 CodeObjectVersion >= AMDGPU::AMDHSA_COV5) 428 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; 429 430 return KernelCodeProperties; 431 } 432 433 amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( 434 const MachineFunction &MF, 435 const SIProgramInfo &PI) const { 436 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 437 const Function &F = MF.getFunction(); 438 439 amdhsa::kernel_descriptor_t KernelDescriptor; 440 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); 441 442 assert(isUInt<32>(PI.ScratchSize)); 443 assert(isUInt<32>(PI.getComputePGMRSrc1())); 444 assert(isUInt<32>(PI.getComputePGMRSrc2())); 445 446 KernelDescriptor.group_segment_fixed_size = PI.LDSSize; 447 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; 448 449 Align MaxKernArgAlign; 450 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); 451 452 
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(); 453 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(); 454 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); 455 456 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); 457 if (STM.hasGFX90AInsts()) 458 KernelDescriptor.compute_pgm_rsrc3 = 459 CurrentProgramInfo.ComputePGMRSrc3GFX90A; 460 461 return KernelDescriptor; 462 } 463 464 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { 465 // Init target streamer lazily on the first function so that previous passes 466 // can set metadata. 467 if (!IsTargetStreamerInitialized) 468 initTargetStreamer(*MF.getFunction().getParent()); 469 470 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); 471 CurrentProgramInfo = SIProgramInfo(); 472 473 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 474 475 // The starting address of all shader programs must be 256 bytes aligned. 476 // Regular functions just need the basic required instruction alignment. 477 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); 478 479 SetupMachineFunction(MF); 480 481 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 482 MCContext &Context = getObjFileLowering().getContext(); 483 // FIXME: This should be an explicit check for Mesa. 
484 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { 485 MCSectionELF *ConfigSection = 486 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); 487 OutStreamer->switchSection(ConfigSection); 488 } 489 490 if (MFI->isModuleEntryFunction()) { 491 getSIProgramInfo(CurrentProgramInfo, MF); 492 } 493 494 if (STM.isAmdPalOS()) { 495 if (MFI->isEntryFunction()) 496 EmitPALMetadata(MF, CurrentProgramInfo); 497 else if (MFI->isModuleEntryFunction()) 498 emitPALFunctionMetadata(MF); 499 } else if (!STM.isAmdHsaOS()) { 500 EmitProgramInfoSI(MF, CurrentProgramInfo); 501 } 502 503 DumpCodeInstEmitter = nullptr; 504 if (STM.dumpCode()) { 505 // For -dumpcode, get the assembler out of the streamer, even if it does 506 // not really want to let us have it. This only works with -filetype=obj. 507 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing(); 508 OutStreamer->setUseAssemblerInfoForParsing(true); 509 MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); 510 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag); 511 if (Assembler) 512 DumpCodeInstEmitter = Assembler->getEmitterPtr(); 513 } 514 515 DisasmLines.clear(); 516 HexLines.clear(); 517 DisasmLineMaxLen = 0; 518 519 emitFunctionBody(); 520 521 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(), 522 STM.hasMAIInsts()); 523 524 if (isVerbose()) { 525 MCSectionELF *CommentSection = 526 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); 527 OutStreamer->switchSection(CommentSection); 528 529 if (!MFI->isEntryFunction()) { 530 OutStreamer->emitRawComment(" Function info:", false); 531 const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = 532 ResourceUsage->getResourceInfo(&MF.getFunction()); 533 emitCommonFunctionComments( 534 Info.NumVGPR, 535 STM.hasMAIInsts() ? 
Info.NumAGPR : std::optional<uint32_t>(), 536 Info.getTotalNumVGPRs(STM), 537 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), 538 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); 539 return false; 540 } 541 542 OutStreamer->emitRawComment(" Kernel info:", false); 543 emitCommonFunctionComments( 544 CurrentProgramInfo.NumArchVGPR, 545 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR 546 : std::optional<uint32_t>(), 547 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, 548 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); 549 550 OutStreamer->emitRawComment( 551 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); 552 OutStreamer->emitRawComment( 553 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); 554 OutStreamer->emitRawComment( 555 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + 556 " bytes/workgroup (compile time only)", false); 557 558 OutStreamer->emitRawComment( 559 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); 560 OutStreamer->emitRawComment( 561 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); 562 563 OutStreamer->emitRawComment( 564 " NumSGPRsForWavesPerEU: " + 565 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); 566 OutStreamer->emitRawComment( 567 " NumVGPRsForWavesPerEU: " + 568 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); 569 570 if (STM.hasGFX90AInsts()) 571 OutStreamer->emitRawComment( 572 " AccumOffset: " + 573 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false); 574 575 OutStreamer->emitRawComment( 576 " Occupancy: " + 577 Twine(CurrentProgramInfo.Occupancy), false); 578 579 OutStreamer->emitRawComment( 580 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); 581 582 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " + 583 Twine(CurrentProgramInfo.ScratchEnable), 584 false); 585 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + 586 Twine(CurrentProgramInfo.UserSGPR), 587 false); 588 
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + 589 Twine(CurrentProgramInfo.TrapHandlerEnable), 590 false); 591 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + 592 Twine(CurrentProgramInfo.TGIdXEnable), 593 false); 594 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + 595 Twine(CurrentProgramInfo.TGIdYEnable), 596 false); 597 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + 598 Twine(CurrentProgramInfo.TGIdZEnable), 599 false); 600 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + 601 Twine(CurrentProgramInfo.TIdIGCompCount), 602 false); 603 604 assert(STM.hasGFX90AInsts() || 605 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); 606 if (STM.hasGFX90AInsts()) { 607 OutStreamer->emitRawComment( 608 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + 609 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, 610 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))), 611 false); 612 OutStreamer->emitRawComment( 613 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + 614 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A, 615 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))), 616 false); 617 } 618 } 619 620 if (DumpCodeInstEmitter) { 621 622 OutStreamer->switchSection( 623 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0)); 624 625 for (size_t i = 0; i < DisasmLines.size(); ++i) { 626 std::string Comment = "\n"; 627 if (!HexLines[i].empty()) { 628 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); 629 Comment += " ; " + HexLines[i] + "\n"; 630 } 631 632 OutStreamer->emitBytes(StringRef(DisasmLines[i])); 633 OutStreamer->emitBytes(StringRef(Comment)); 634 } 635 } 636 637 return false; 638 } 639 640 // TODO: Fold this into emitFunctionBodyStart. 641 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { 642 // In the beginning all features are either 'Any' or 'NotSupported', 643 // depending on global target features. This will cover empty modules. 
644 getTargetStreamer()->initializeTargetID( 645 *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion); 646 647 // If module is empty, we are done. 648 if (M.empty()) 649 return; 650 651 // If module is not empty, need to find first 'Off' or 'On' feature 652 // setting per feature from functions in module. 653 for (auto &F : M) { 654 auto &TSTargetID = getTargetStreamer()->getTargetID(); 655 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && 656 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) 657 break; 658 659 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); 660 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); 661 if (TSTargetID->isXnackSupported()) 662 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) 663 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); 664 if (TSTargetID->isSramEccSupported()) 665 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) 666 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); 667 } 668 } 669 670 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { 671 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 672 const SIInstrInfo *TII = STM.getInstrInfo(); 673 674 uint64_t CodeSize = 0; 675 676 for (const MachineBasicBlock &MBB : MF) { 677 for (const MachineInstr &MI : MBB) { 678 // TODO: CodeSize should account for multiple functions. 679 680 // TODO: Should we count size of debug info? 
681 if (MI.isDebugInstr()) 682 continue; 683 684 CodeSize += TII->getInstSizeInBytes(MI); 685 } 686 } 687 688 return CodeSize; 689 } 690 691 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 692 const MachineFunction &MF) { 693 const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = 694 ResourceUsage->getResourceInfo(&MF.getFunction()); 695 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 696 697 ProgInfo.NumArchVGPR = Info.NumVGPR; 698 ProgInfo.NumAccVGPR = Info.NumAGPR; 699 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM); 700 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1; 701 ProgInfo.TgSplit = STM.isTgSplitEnabled(); 702 ProgInfo.NumSGPR = Info.NumExplicitSGPR; 703 ProgInfo.ScratchSize = Info.PrivateSegmentSize; 704 ProgInfo.VCCUsed = Info.UsesVCC; 705 ProgInfo.FlatUsed = Info.UsesFlatScratch; 706 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; 707 708 const uint64_t MaxScratchPerWorkitem = 709 STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); 710 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) { 711 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), 712 ProgInfo.ScratchSize, 713 MaxScratchPerWorkitem, DS_Error); 714 MF.getFunction().getContext().diagnose(DiagStackSize); 715 } 716 717 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 718 719 // The calculations related to SGPR/VGPR blocks are 720 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be 721 // unified. 722 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( 723 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed, 724 getTargetStreamer()->getTargetID()->isXnackOnOrAny()); 725 726 // Check the addressable register limit before we add ExtraSGPRs. 
727 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 728 !STM.hasSGPRInitBug()) { 729 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); 730 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { 731 // This can happen due to a compiler bug or when using inline asm. 732 LLVMContext &Ctx = MF.getFunction().getContext(); 733 DiagnosticInfoResourceLimit Diag( 734 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR, 735 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); 736 Ctx.diagnose(Diag); 737 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; 738 } 739 } 740 741 // Account for extra SGPRs and VGPRs reserved for debugger use. 742 ProgInfo.NumSGPR += ExtraSGPRs; 743 744 const Function &F = MF.getFunction(); 745 746 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave 747 // dispatch registers are function args. 748 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; 749 750 if (isShader(F.getCallingConv())) { 751 bool IsPixelShader = 752 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); 753 754 // Calculate the number of VGPR registers based on the SPI input registers 755 uint32_t InputEna = 0; 756 uint32_t InputAddr = 0; 757 unsigned LastEna = 0; 758 759 if (IsPixelShader) { 760 // Note for IsPixelShader: 761 // By this stage, all enabled inputs are tagged in InputAddr as well. 762 // We will use InputAddr to determine whether the input counts against the 763 // vgpr total and only use the InputEnable to determine the last input 764 // that is relevant - if extra arguments are used, then we have to honour 765 // the InputAddr for any intermediate non-enabled inputs. 766 InputEna = MFI->getPSInputEnable(); 767 InputAddr = MFI->getPSInputAddr(); 768 769 // We only need to consider input args up to the last used arg. 
      assert((InputEna || InputAddr) &&
             "PSInputAddr and PSInputEnable should "
             "never both be 0 for AMDGPU_PS shaders");
      // There are some rare circumstances where InputAddr is non-zero and
      // InputEna can be set to 0. In this case we default to setting LastEna
      // to 1.
      LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
    }

    // Count the SGPRs/VGPRs implied by the kernel's argument list, so the
    // dispatch register totals below can never under-report what argument
    // lowering will actually consume.
    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getParent()->getDataLayout();
    unsigned PSArgCount = 0;
    unsigned IntermediateVGPR = 0;
    for (auto &Arg : F.args()) {
      // Each argument occupies ceil(bits / 32) 32-bit registers.
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg)) {
        WaveDispatchNumSGPR += NumRegs;
      } else {
        // If this is a PS shader and we're processing the PS Input args (first
        // 16 VGPR), use the InputEna and InputAddr bits to define how many
        // VGPRs are actually used.
        // Any extra VGPR arguments are handled as normal arguments (and
        // contribute to the VGPR count whether they're used or not).
        if (IsPixelShader && PSArgCount < 16) {
          if ((1 << PSArgCount) & InputAddr) {
            if (PSArgCount < LastEna)
              WaveDispatchNumVGPR += NumRegs;
            else
              // Enabled-but-unused inputs are only charged if a later,
              // used argument forces their allocation (see else branch).
              IntermediateVGPR += NumRegs;
          }
          PSArgCount++;
        } else {
          // If there are extra arguments we have to include the allocation for
          // the non-used (but enabled with InputAddr) input arguments
          if (IntermediateVGPR) {
            WaveDispatchNumVGPR += IntermediateVGPR;
            IntermediateVGPR = 0;
          }
          WaveDispatchNumVGPR += NumRegs;
        }
      }
    }
    // Dispatch counts only ever raise the totals computed by the resource
    // usage analysis; they never lower them.
    ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
    ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
    ProgInfo.NumVGPR =
        Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
      std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
      std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      // Clamp so the emitted descriptor is still encodable after diagnosing.
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    // On affected parts every wave must be reported as using the fixed SGPR
    // count regardless of actual usage.
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() >
      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // Convert raw register counts into the block granules the hardware
  // registers are programmed with.
  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  // Round the byte size up to whole LDS allocation blocks (see LDSAlignShift
  // above) and express it in blocks.
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = divideCeil(
      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    // WGP mode is the inverse of CU mode on gfx10+.
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
  ProgInfo.ScratchEnable =
      ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable =
      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  if (STM.hasGFX90AInsts()) {
    // gfx90a carries its accumulator offset and TG split setting in
    // COMPUTE_PGM_RSRC3.
    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                    ProgInfo.AccumOffset);
    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                    ProgInfo.TgSplit);
  }

  ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
                                            ProgInfo.NumSGPRsForWavesPerEU,
                                            ProgInfo.NumVGPRsForWavesPerEU);
}

// Map a calling convention to the SPI/compute RSRC1 register that holds its
// program settings. Unknown conventions fall through to the compute register.
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: [[fallthrough]];
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

// Emit the program-info register/value pairs for the non-HSA (e.g. Mesa) ABI
// as raw 32-bit words in the output stream.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
    // The WAVESIZE field encoding changed with GFX11.
    OutStreamer->emitInt32(
        STM.getGeneration() >= AMDGPUSubtarget::GFX11
            ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
            : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    // Graphics shader: program the stage-specific RSRC1 register selected
    // above.
    OutStreamer->emitInt32(RsrcReg);
    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
    OutStreamer->emitInt32(
        STM.getGeneration() >= AMDGPUSubtarget::GFX11
            ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
            : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    // On GFX11+ the EXTRA_LDS_SIZE granule is twice as large, so halve
    // (rounding up) the block count.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);

  // Only set AGPRs for supported devices
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
  if (MD->getPALMajorVersion() < 3) {
    // Legacy (pre-v3) PAL metadata stores raw RSRC register values.
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
    } else {
      if (CurrentProgramInfo.ScratchBlocks > 0)
        MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
    }
  } else {
    // PAL metadata v3+ uses named per-hardware-stage fields instead of raw
    // register values.
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
    MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
    MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);

    if (AMDGPU::isCompute(CC)) {
      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
      MD->setHwStage(CC, ".trap_present",
                     (bool)CurrentProgramInfo.TrapHandlerEnable);

      // EXCPEnMSB?
      // .lds_size is reported in bytes: LdsSize is in 128-dword granules.
      const unsigned LdsDwGranularity = 128;
      MD->setHwStage(CC, ".lds_size",
                     (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
                                sizeof(uint32_t)));
      MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
    } else {
      MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
    }
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // GFX11 doubles the EXTRA_LDS_SIZE granule; halve the block count
    // (rounding up) to compensate.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
      // Field order matches the hardware bit layout of SPI_PS_INPUT_ENA/ADDR,
      // bit 0 first.
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena", ".persp_center_ena",
          ".persp_centroid_ena", ".persp_pull_model_ena",
          ".linear_sample_ena", ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena", ".pos_y_float_ena",
          ".pos_z_float_ena", ".pos_w_float_ena",
          ".front_face_ena", ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above the wave front size is already set in the metadata
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

// Record per-function (callable, non-entry) resource usage in the PAL
// metadata: stack size, LDS, and register counts.
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MD->setFunctionScratchSize(MF, MFI.getStackSize());

  // Set compute registers
  MD->setRsrc1(CallingConv::AMDGPU_CS,
               CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());

  // Set optional info
  MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}

//
This is supposed to be log2(Size) 1130 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { 1131 switch (Size) { 1132 case 4: 1133 return AMD_ELEMENT_4_BYTES; 1134 case 8: 1135 return AMD_ELEMENT_8_BYTES; 1136 case 16: 1137 return AMD_ELEMENT_16_BYTES; 1138 default: 1139 llvm_unreachable("invalid private_element_size"); 1140 } 1141 } 1142 1143 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, 1144 const SIProgramInfo &CurrentProgramInfo, 1145 const MachineFunction &MF) const { 1146 const Function &F = MF.getFunction(); 1147 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 1148 F.getCallingConv() == CallingConv::SPIR_KERNEL); 1149 1150 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1151 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1152 1153 AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); 1154 1155 Out.compute_pgm_resource_registers = 1156 CurrentProgramInfo.getComputePGMRSrc1() | 1157 (CurrentProgramInfo.getComputePGMRSrc2() << 32); 1158 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; 1159 1160 if (CurrentProgramInfo.DynamicCallStack) 1161 Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; 1162 1163 AMD_HSA_BITS_SET(Out.code_properties, 1164 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, 1165 getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); 1166 1167 if (MFI->hasPrivateSegmentBuffer()) { 1168 Out.code_properties |= 1169 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 1170 } 1171 1172 if (MFI->hasDispatchPtr()) 1173 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1174 1175 if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) 1176 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 1177 1178 if (MFI->hasKernargSegmentPtr()) 1179 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 1180 1181 if (MFI->hasDispatchID()) 1182 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 1183 1184 if 
(MFI->hasFlatScratchInit()) 1185 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 1186 1187 if (MFI->hasDispatchPtr()) 1188 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1189 1190 if (STM.isXNACKEnabled()) 1191 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; 1192 1193 Align MaxKernArgAlign; 1194 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); 1195 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; 1196 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; 1197 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; 1198 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; 1199 1200 // kernarg_segment_alignment is specified as log of the alignment. 1201 // The minimum alignment is 16. 1202 // FIXME: The metadata treats the minimum as 4? 1203 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); 1204 } 1205 1206 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, 1207 const char *ExtraCode, raw_ostream &O) { 1208 // First try the generic code, which knows about modifiers like 'c' and 'n'. 1209 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) 1210 return false; 1211 1212 if (ExtraCode && ExtraCode[0]) { 1213 if (ExtraCode[1] != 0) 1214 return true; // Unknown modifier. 1215 1216 switch (ExtraCode[0]) { 1217 case 'r': 1218 break; 1219 default: 1220 return true; 1221 } 1222 } 1223 1224 // TODO: Should be able to support other operand types like globals. 
1225 const MachineOperand &MO = MI->getOperand(OpNo); 1226 if (MO.isReg()) { 1227 AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, 1228 *MF->getSubtarget().getRegisterInfo()); 1229 return false; 1230 } else if (MO.isImm()) { 1231 int64_t Val = MO.getImm(); 1232 if (AMDGPU::isInlinableIntLiteral(Val)) { 1233 O << Val; 1234 } else if (isUInt<16>(Val)) { 1235 O << format("0x%" PRIx16, static_cast<uint16_t>(Val)); 1236 } else if (isUInt<32>(Val)) { 1237 O << format("0x%" PRIx32, static_cast<uint32_t>(Val)); 1238 } else { 1239 O << format("0x%" PRIx64, static_cast<uint64_t>(Val)); 1240 } 1241 return false; 1242 } 1243 return true; 1244 } 1245 1246 void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { 1247 AU.addRequired<AMDGPUResourceUsageAnalysis>(); 1248 AU.addPreserved<AMDGPUResourceUsageAnalysis>(); 1249 AsmPrinter::getAnalysisUsage(AU); 1250 } 1251 1252 void AMDGPUAsmPrinter::emitResourceUsageRemarks( 1253 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, 1254 bool isModuleEntryFunction, bool hasMAIInsts) { 1255 if (!ORE) 1256 return; 1257 1258 const char *Name = "kernel-resource-usage"; 1259 const char *Indent = " "; 1260 1261 // If the remark is not specifically enabled, do not output to yaml 1262 LLVMContext &Ctx = MF.getFunction().getContext(); 1263 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name)) 1264 return; 1265 1266 auto EmitResourceUsageRemark = [&](StringRef RemarkName, 1267 StringRef RemarkLabel, auto Argument) { 1268 // Add an indent for every line besides the line with the kernel name. This 1269 // makes it easier to tell which resource usage go with which kernel since 1270 // the kernel name will always be displayed first. 
1271 std::string LabelStr = RemarkLabel.str() + ": "; 1272 if (!RemarkName.equals("FunctionName")) 1273 LabelStr = Indent + LabelStr; 1274 1275 ORE->emit([&]() { 1276 return MachineOptimizationRemarkAnalysis(Name, RemarkName, 1277 MF.getFunction().getSubprogram(), 1278 &MF.front()) 1279 << LabelStr << ore::NV(RemarkName, Argument); 1280 }); 1281 }; 1282 1283 // FIXME: Formatting here is pretty nasty because clang does not accept 1284 // newlines from diagnostics. This forces us to emit multiple diagnostic 1285 // remarks to simulate newlines. If and when clang does accept newlines, this 1286 // formatting should be aggregated into one remark with newlines to avoid 1287 // printing multiple diagnostic location and diag opts. 1288 EmitResourceUsageRemark("FunctionName", "Function Name", 1289 MF.getFunction().getName()); 1290 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR); 1291 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR); 1292 if (hasMAIInsts) 1293 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR); 1294 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", 1295 CurrentProgramInfo.ScratchSize); 1296 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", 1297 CurrentProgramInfo.Occupancy); 1298 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", 1299 CurrentProgramInfo.SGPRSpill); 1300 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", 1301 CurrentProgramInfo.VGPRSpill); 1302 if (isModuleEntryFunction) 1303 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]", 1304 CurrentProgramInfo.LDSSize); 1305 } 1306