1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// 11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary 12 /// code. When passed an MCAsmStreamer it prints assembly and when passed 13 /// an MCObjectStreamer it outputs binary code. 14 // 15 //===----------------------------------------------------------------------===// 16 // 17 18 #include "AMDGPUAsmPrinter.h" 19 #include "AMDGPU.h" 20 #include "AMDGPUHSAMetadataStreamer.h" 21 #include "AMDGPUResourceUsageAnalysis.h" 22 #include "AMDKernelCodeT.h" 23 #include "GCNSubtarget.h" 24 #include "MCTargetDesc/AMDGPUInstPrinter.h" 25 #include "MCTargetDesc/AMDGPUTargetStreamer.h" 26 #include "R600AsmPrinter.h" 27 #include "SIMachineFunctionInfo.h" 28 #include "TargetInfo/AMDGPUTargetInfo.h" 29 #include "Utils/AMDGPUBaseInfo.h" 30 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 31 #include "llvm/BinaryFormat/ELF.h" 32 #include "llvm/CodeGen/MachineFrameInfo.h" 33 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/MC/MCAssembler.h" 36 #include "llvm/MC/MCContext.h" 37 #include "llvm/MC/MCSectionELF.h" 38 #include "llvm/MC/MCStreamer.h" 39 #include "llvm/MC/TargetRegistry.h" 40 #include "llvm/Support/AMDHSAKernelDescriptor.h" 41 #include "llvm/Support/TargetParser.h" 42 #include "llvm/Target/TargetLoweringObjectFile.h" 43 #include "llvm/Target/TargetMachine.h" 44 45 using namespace llvm; 46 using namespace llvm::AMDGPU; 47 48 // This should get the default rounding mode from the kernel. 
// We just set the default here, but this could change if the OpenCL rounding
// mode pragmas are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
// and sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
68 static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) { 69 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | 70 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | 71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | 72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); 73 } 74 75 static AsmPrinter * 76 createAMDGPUAsmPrinterPass(TargetMachine &tm, 77 std::unique_ptr<MCStreamer> &&Streamer) { 78 return new AMDGPUAsmPrinter(tm, std::move(Streamer)); 79 } 80 81 extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() { 82 TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), 83 llvm::createR600AsmPrinterPass); 84 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), 85 createAMDGPUAsmPrinterPass); 86 } 87 88 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, 89 std::unique_ptr<MCStreamer> Streamer) 90 : AsmPrinter(TM, std::move(Streamer)) { 91 assert(OutStreamer && "AsmPrinter constructed without streamer"); 92 93 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 94 if (isHsaAbiVersion2(getGlobalSTI())) { 95 HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2()); 96 } else if (isHsaAbiVersion3(getGlobalSTI())) { 97 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3()); 98 } else if (isHsaAbiVersion5(getGlobalSTI())) { 99 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5()); 100 } else { 101 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); 102 } 103 } 104 } 105 106 StringRef AMDGPUAsmPrinter::getPassName() const { 107 return "AMDGPU Assembly Printer"; 108 } 109 110 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { 111 return TM.getMCSubtargetInfo(); 112 } 113 114 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { 115 if (!OutStreamer) 116 return nullptr; 117 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); 118 } 119 120 void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { 121 IsTargetStreamerInitialized = false; 122 } 123 124 void 
AMDGPUAsmPrinter::initTargetStreamer(Module &M) { 125 IsTargetStreamerInitialized = true; 126 127 // TODO: Which one is called first, emitStartOfAsmFile or 128 // emitFunctionBodyStart? 129 if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) 130 initializeTargetID(M); 131 132 if (TM.getTargetTriple().getOS() != Triple::AMDHSA && 133 TM.getTargetTriple().getOS() != Triple::AMDPAL) 134 return; 135 136 if (isHsaAbiVersion3AndAbove(getGlobalSTI())) 137 getTargetStreamer()->EmitDirectiveAMDGCNTarget(); 138 139 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) 140 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); 141 142 if (TM.getTargetTriple().getOS() == Triple::AMDPAL) 143 getTargetStreamer()->getPALMetadata()->readFromIR(M); 144 145 if (isHsaAbiVersion3AndAbove(getGlobalSTI())) 146 return; 147 148 // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2. 149 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) 150 getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); 151 152 // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2. 153 IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU()); 154 getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2( 155 Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); 156 } 157 158 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { 159 // Init target streamer if it has not yet happened 160 if (!IsTargetStreamerInitialized) 161 initTargetStreamer(M); 162 163 if (TM.getTargetTriple().getOS() != Triple::AMDHSA || 164 isHsaAbiVersion2(getGlobalSTI())) 165 getTargetStreamer()->EmitISAVersion(); 166 167 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). 168 // Emit HSA Metadata (NT_AMD_HSA_METADATA). 
169 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 170 HSAMetadataStream->end(); 171 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); 172 (void)Success; 173 assert(Success && "Malformed HSA Metadata"); 174 } 175 } 176 177 bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( 178 const MachineBasicBlock *MBB) const { 179 if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB)) 180 return false; 181 182 if (MBB->empty()) 183 return true; 184 185 // If this is a block implementing a long branch, an expression relative to 186 // the start of the block is needed. to the start of the block. 187 // XXX - Is there a smarter way to check this? 188 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64); 189 } 190 191 void AMDGPUAsmPrinter::emitFunctionBodyStart() { 192 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 193 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 194 const Function &F = MF->getFunction(); 195 196 // TODO: Which one is called first, emitStartOfAsmFile or 197 // emitFunctionBodyStart? 198 if (!getTargetStreamer()->getTargetID()) 199 initializeTargetID(*F.getParent()); 200 201 const auto &FunctionTargetID = STM.getTargetID(); 202 // Make sure function's xnack settings are compatible with module's 203 // xnack settings. 204 if (FunctionTargetID.isXnackSupported() && 205 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && 206 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { 207 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) + 208 "' function does not match module xnack setting"); 209 return; 210 } 211 // Make sure function's sramecc settings are compatible with module's 212 // sramecc settings. 
// (emitFunctionBodyStart, continued) Report an error and stop on an sramecc mismatch.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
                           "' function does not match module sramecc setting");
    return;
  }

  // Everything below is emitted for entry functions only.
  if (!MFI.isEntryFunction())
    return;

  // Mesa kernels and HSA ABI v2 kernels (AMDGPU_KERNEL / SPIR_KERNEL calling
  // conventions) get an amd_kernel_code_t built from CurrentProgramInfo.
  if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  // On AMDHSA, record this kernel in the HSA metadata stream.
  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}

/// For entry functions on AMDHSA with an ABI newer than v2, emits the kernel
/// descriptor into the read-only section after the function body. Does
/// nothing for any other function or target.
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
      isHsaAbiVersion2(getGlobalSTI()))
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  // Save the current section; it is restored by the popSection() call at the
  // end of this function.
  Streamer.pushSection();
  Streamer.switchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
255 Streamer.emitValueToAlignment(Align(64), 0, 1, 0); 256 ReadOnlySection.ensureMinAlignment(Align(64)); 257 258 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 259 260 SmallString<128> KernelName; 261 getNameWithPrefix(KernelName, &MF->getFunction()); 262 getTargetStreamer()->EmitAmdhsaKernelDescriptor( 263 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), 264 CurrentProgramInfo.NumVGPRsForWavesPerEU, 265 CurrentProgramInfo.NumSGPRsForWavesPerEU - 266 IsaInfo::getNumExtraSGPRs(&STM, 267 CurrentProgramInfo.VCCUsed, 268 CurrentProgramInfo.FlatUsed), 269 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); 270 271 Streamer.popSection(); 272 } 273 274 void AMDGPUAsmPrinter::emitFunctionEntryLabel() { 275 if (TM.getTargetTriple().getOS() == Triple::AMDHSA && 276 isHsaAbiVersion3AndAbove(getGlobalSTI())) { 277 AsmPrinter::emitFunctionEntryLabel(); 278 return; 279 } 280 281 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 282 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 283 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) { 284 SmallString<128> SymbolName; 285 getNameWithPrefix(SymbolName, &MF->getFunction()), 286 getTargetStreamer()->EmitAMDGPUSymbolType( 287 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); 288 } 289 if (DumpCodeInstEmitter) { 290 // Disassemble function name label to text. 291 DisasmLines.push_back(MF->getName().str() + ":"); 292 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 293 HexLines.push_back(""); 294 } 295 296 AsmPrinter::emitFunctionEntryLabel(); 297 } 298 299 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { 300 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { 301 // Write a line for the basic block label if it is not only fallthrough. 
302 DisasmLines.push_back( 303 (Twine("BB") + Twine(getFunctionNumber()) 304 + "_" + Twine(MBB.getNumber()) + ":").str()); 305 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 306 HexLines.push_back(""); 307 } 308 AsmPrinter::emitBasicBlockStart(MBB); 309 } 310 311 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { 312 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 313 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { 314 OutContext.reportError({}, 315 Twine(GV->getName()) + 316 ": unsupported initializer for address space"); 317 return; 318 } 319 320 // LDS variables aren't emitted in HSA or PAL yet. 321 const Triple::OSType OS = TM.getTargetTriple().getOS(); 322 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 323 return; 324 325 MCSymbol *GVSym = getSymbol(GV); 326 327 GVSym->redefineIfPossible(); 328 if (GVSym->isDefined() || GVSym->isVariable()) 329 report_fatal_error("symbol '" + Twine(GVSym->getName()) + 330 "' is already defined"); 331 332 const DataLayout &DL = GV->getParent()->getDataLayout(); 333 uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); 334 Align Alignment = GV->getAlign().value_or(Align(4)); 335 336 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); 337 emitLinkage(GV, GVSym); 338 auto TS = getTargetStreamer(); 339 TS->emitAMDGPULDS(GVSym, Size, Alignment); 340 return; 341 } 342 343 AsmPrinter::emitGlobalVariable(GV); 344 } 345 346 bool AMDGPUAsmPrinter::doFinalization(Module &M) { 347 // Pad with s_code_end to help tools and guard against instruction prefetch 348 // causing stale data in caches. Arguably this should be done by the linker, 349 // which is why this isn't done for Mesa. 
350 const MCSubtargetInfo &STI = *getGlobalSTI(); 351 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && 352 (STI.getTargetTriple().getOS() == Triple::AMDHSA || 353 STI.getTargetTriple().getOS() == Triple::AMDPAL)) { 354 OutStreamer->switchSection(getObjFileLowering().getTextSection()); 355 getTargetStreamer()->EmitCodeEnd(STI); 356 } 357 358 return AsmPrinter::doFinalization(M); 359 } 360 361 // Print comments that apply to both callable functions and entry points. 362 void AMDGPUAsmPrinter::emitCommonFunctionComments( 363 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, 364 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, 365 const AMDGPUMachineFunction *MFI) { 366 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); 367 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); 368 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); 369 if (NumAGPR) { 370 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); 371 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), 372 false); 373 } 374 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); 375 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), 376 false); 377 } 378 379 uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( 380 const MachineFunction &MF) const { 381 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 382 uint16_t KernelCodeProperties = 0; 383 384 if (MFI.hasPrivateSegmentBuffer()) { 385 KernelCodeProperties |= 386 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 387 } 388 if (MFI.hasDispatchPtr()) { 389 KernelCodeProperties |= 390 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 391 } 392 if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { 393 KernelCodeProperties |= 394 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 395 } 396 if (MFI.hasKernargSegmentPtr()) { 397 
KernelCodeProperties |= 398 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 399 } 400 if (MFI.hasDispatchID()) { 401 KernelCodeProperties |= 402 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 403 } 404 if (MFI.hasFlatScratchInit()) { 405 KernelCodeProperties |= 406 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 407 } 408 if (MF.getSubtarget<GCNSubtarget>().isWave32()) { 409 KernelCodeProperties |= 410 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; 411 } 412 413 if (CurrentProgramInfo.DynamicCallStack && 414 AMDGPU::getAmdhsaCodeObjectVersion() >= 5) { 415 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; 416 } 417 418 return KernelCodeProperties; 419 } 420 421 amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( 422 const MachineFunction &MF, 423 const SIProgramInfo &PI) const { 424 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 425 const Function &F = MF.getFunction(); 426 427 amdhsa::kernel_descriptor_t KernelDescriptor; 428 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); 429 430 assert(isUInt<32>(PI.ScratchSize)); 431 assert(isUInt<32>(PI.getComputePGMRSrc1())); 432 assert(isUInt<32>(PI.ComputePGMRSrc2)); 433 434 KernelDescriptor.group_segment_fixed_size = PI.LDSSize; 435 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; 436 437 Align MaxKernArgAlign; 438 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); 439 440 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(); 441 KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; 442 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); 443 444 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); 445 if (STM.hasGFX90AInsts()) 446 KernelDescriptor.compute_pgm_rsrc3 = 447 CurrentProgramInfo.ComputePGMRSrc3GFX90A; 448 449 return KernelDescriptor; 450 } 451 452 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction 
&MF) {
  // Per-function driver. In order: lazily initializes the target streamer,
  // resets and (for module entry functions) recomputes CurrentProgramInfo,
  // emits the per-OS program metadata, emits the function body, then emits
  // resource-usage remarks, optional verbose comments and the optional
  // -dumpcode disassembly section. Always returns false.

  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(*MF.getFunction().getParent());

  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(ConfigSection);
  }

  // For any other function CurrentProgramInfo keeps the default-constructed
  // value assigned above.
  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  }

  // PAL: entry functions get full PAL metadata, other module entry functions
  // get per-function metadata. Neither PAL nor HSA: emit the SI program info
  // (into the config section selected above).
  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer, even if it does
    // not really want to let us have it. This only works with -filetype=obj.
    // The streamer flag is forced on only for the duration of the query and
    // then restored to its previous value.
    bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
    OutStreamer->setUseAssemblerInfoForParsing(true);
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  // Reset the per-function disassembly listing state.
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                           STM.hasMAIInsts());

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      // Non-entry functions report the resource analysis results directly.
      OutStreamer->emitRawComment(" Function info:", false);
      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
          ResourceUsage->getResourceInfo(&MF.getFunction());
      emitCommonFunctionComments(
          Info.NumVGPR,
          STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
          Info.getTotalNumVGPRs(STM),
          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
          Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
      // NOTE(review): this early return also skips the -dumpcode disassembly
      // section below for non-entry functions in verbose mode; confirm that
      // is intended.
      return false;
    }

    // Entry functions report the values computed into CurrentProgramInfo.
    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
                          : std::optional<uint32_t>(),
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
      " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
      " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
      " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
      " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
      " NumSGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
      " NumVGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    // AccumOffset is stored as (value / 4 - 1); print the decoded value.
    if (STM.hasGFX90AInsts())
      OutStreamer->emitRawComment(
        " AccumOffset: " +
        Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);

    OutStreamer->emitRawComment(
      " Occupancy: " +
      Twine(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
      " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    // Decode and print the individual fields of COMPUTE_PGM_RSRC2.
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
      Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
      false);

    assert(STM.hasGFX90AInsts() ||
           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
                               false);
      OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
                               false);
    }
  }

  if (DumpCodeInstEmitter) {

    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    // One output line per recorded disassembly line. Lines with a non-empty
    // hex entry are padded to the widest disassembly line and followed by
    // " ; <hex>"; the others are just terminated with a newline.
    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}

// TODO: Fold this into emitFunctionBodyStart.
/// Initializes the target streamer's target ID from the global subtarget
/// features, then refines it from the functions of \p M.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(
      *getGlobalSTI(), getGlobalSTI()->getFeatureString());

  // If module is empty, we are done.
// (initializeTargetID, continued) Refine the target ID from the module's functions.
  if (M.empty())
    return;

  // If module is not empty, need to find first 'Off' or 'On' feature
  // setting per feature from functions in module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    // Stop as soon as every supported feature has a definite On/Off setting.
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    // A feature still set to 'Any' takes over this function's setting.
    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}

/// Returns the sum of the encoded sizes, in bytes, of all non-debug
/// instructions in \p MF.
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
// (getFunctionCodeSize, continued) Debug instructions contribute no size.
      if (MI.isDebugInstr())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}

/// Fills \p ProgInfo for \p MF from the resource usage analysis, the
/// subtarget and the machine function info: register counts (with limit
/// diagnostics and clamping), register/LDS/scratch block counts, FP mode
/// bits, COMPUTE_PGM_RSRC2/RSRC3 and occupancy.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
      ResourceUsage->getResourceInfo(&MF.getFunction());
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  // Seed the program info with the raw analysis results.
  ProgInfo.NumArchVGPR = Info.NumVGPR;
  ProgInfo.NumAccVGPR = Info.NumAGPR;
  ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
  // Encoded as (VGPR count rounded up to a multiple of 4) / 4 - 1, treating
  // a count of 0 as 1.
  ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  // Diagnose (as an error) scratch usage above the per-work-item maximum.
  const uint64_t MaxScratchPerWorkitem =
      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize,
                                          MaxScratchPerWorkitem, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      // After diagnosing, clamp to one below the addressable limit.
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  // NOTE(review): only SGPRs are added here; the VGPR part of the comment
  // above has no matching code in this function.
  ProgInfo.NumSGPR += ExtraSGPRs;

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

  if (isShader(F.getCallingConv())) {
    bool IsPixelShader =
        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

    // Calculate the number of VGPR registers based on the SPI input registers
    uint32_t InputEna = 0;
    uint32_t InputAddr = 0;
    unsigned LastEna = 0;

    if (IsPixelShader) {
      // Note for IsPixelShader:
      // By this stage, all enabled inputs are tagged in InputAddr as well.
      // We will use InputAddr to determine whether the input counts against the
      // vgpr total and only use the InputEnable to determine the last input
      // that is relevant - if extra arguments are used, then we have to honour
      // the InputAddr for any intermediate non-enabled inputs.
      InputEna = MFI->getPSInputEnable();
      InputAddr = MFI->getPSInputAddr();

      // We only need to consider input args up to the last used arg.
      assert((InputEna || InputAddr) &&
             "PSInputAddr and PSInputEnable should "
             "never both be 0 for AMDGPU_PS shaders");
      // There are some rare circumstances where InputAddr is non-zero and
      // InputEna can be set to 0. In this case we default to setting LastEna
      // to 1.
      LastEna = InputEna ? findLastSet(InputEna) + 1 : 1;
    }

    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getParent()->getDataLayout();
    unsigned PSArgCount = 0;
    unsigned IntermediateVGPR = 0;
    for (auto &Arg : F.args()) {
      // Number of 32-bit registers needed for the argument, rounded up.
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg)) {
        // 'inreg' arguments are counted as SGPRs.
        WaveDispatchNumSGPR += NumRegs;
      } else {
        // If this is a PS shader and we're processing the PS Input args (first
        // 16 VGPR), use the InputEna and InputAddr bits to define how many
        // VGPRs are actually used.
        // Any extra VGPR arguments are handled as normal arguments (and
        // contribute to the VGPR count whether they're used or not).
        if (IsPixelShader && PSArgCount < 16) {
          if ((1 << PSArgCount) & InputAddr) {
            if (PSArgCount < LastEna)
              WaveDispatchNumVGPR += NumRegs;
            else
              // Held back; only counted if a later extra argument exists.
              IntermediateVGPR += NumRegs;
          }
          PSArgCount++;
        } else {
          // If there are extra arguments we have to include the allocation for
          // the non-used (but enabled with InputAddr) input arguments
          if (IntermediateVGPR) {
            WaveDispatchNumVGPR += IntermediateVGPR;
            IntermediateVGPR = 0;
          }
          WaveDispatchNumVGPR += NumRegs;
        }
      }
    }
    ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
    // NOTE(review): the max is taken against ProgInfo.NumVGPR (the total
    // computed at the top of this function), not ProgInfo.NumArchVGPR;
    // confirm this is intended.
    ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
    ProgInfo.NumVGPR =
        Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  // Both counts are at least 1 and at least the subtarget minimum for the
  // function's maximum waves-per-EU.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  // On these subtargets the addressable limit is checked after ExtraSGPRs
  // has been added (compare with the earlier check above).
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  // With the SGPR init bug, both SGPR counts are forced to a fixed value.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  // Diagnose too many user SGPRs; the value itself is left unchanged.
  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // Diagnose LDS usage above the addressable local memory size.
  if (MFI->getLDSSize() >
      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      &STM, ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      &STM, ProgInfo.NumVGPRsForWavesPerEU);

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make the clamp modifier return 0 on NaN input.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  // Shift amounts are in bytes: 1 << 8 = 256 bytes = 64 dwords,
  // 1 << 9 = 512 bytes = 128 dwords.
  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  // LDS size rounded up to whole allocation blocks, expressed in blocks.
  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 64-dword or 256-dword blocks.
  // (1 << 8 bytes for GFX11 and later, 1 << 10 bytes otherwise.)
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = divideCeil(
      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);

  // ISA major version 10 and above: WgpMode is the inverse of CU mode and
  // MemOrdered is always set.
  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
912 const bool EnablePrivateSegment = 913 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack; 914 ProgInfo.ComputePGMRSrc2 = 915 S_00B84C_SCRATCH_EN(EnablePrivateSegment) | 916 S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | 917 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. 918 S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) | 919 S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | 920 S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | 921 S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | 922 S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | 923 S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | 924 S_00B84C_EXCP_EN_MSB(0) | 925 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. 926 S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | 927 S_00B84C_EXCP_EN(0); 928 929 if (STM.hasGFX90AInsts()) { 930 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, 931 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, 932 ProgInfo.AccumOffset); 933 AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, 934 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, 935 ProgInfo.TgSplit); 936 } 937 938 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize, 939 ProgInfo.NumSGPRsForWavesPerEU, 940 ProgInfo.NumVGPRsForWavesPerEU); 941 } 942 943 static unsigned getRsrcReg(CallingConv::ID CallConv) { 944 switch (CallConv) { 945 default: [[fallthrough]]; 946 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; 947 case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; 948 case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; 949 case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; 950 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; 951 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; 952 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; 953 } 954 } 955 956 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, 957 
const SIProgramInfo &CurrentProgramInfo) { 958 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 959 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 960 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); 961 962 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 963 OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); 964 965 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1()); 966 967 OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); 968 OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); 969 970 OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); 971 OutStreamer->emitInt32( 972 STM.getGeneration() >= AMDGPUSubtarget::GFX11 973 ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) 974 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); 975 976 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = 977 // 0" comment but I don't see a corresponding field in the register spec. 978 } else { 979 OutStreamer->emitInt32(RsrcReg); 980 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | 981 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); 982 OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); 983 OutStreamer->emitInt32( 984 STM.getGeneration() >= AMDGPUSubtarget::GFX11 985 ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks) 986 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks)); 987 } 988 989 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { 990 OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); 991 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 992 ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) 993 : CurrentProgramInfo.LDSBlocks; 994 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); 995 OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); 996 OutStreamer->emitInt32(MFI->getPSInputEnable()); 997 OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); 998 OutStreamer->emitInt32(MFI->getPSInputAddr()); 999 } 1000 1001 OutStreamer->emitInt32(R_SPILLED_SGPRS); 1002 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs()); 1003 OutStreamer->emitInt32(R_SPILLED_VGPRS); 1004 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs()); 1005 } 1006 1007 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type 1008 // is AMDPAL. It stores each compute/SPI register setting and other PAL 1009 // metadata items into the PALMD::Metadata, combining with any provided by the 1010 // frontend as LLVM metadata. Once all functions are written, the PAL metadata 1011 // is then written as a single block in the .note section. 1012 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, 1013 const SIProgramInfo &CurrentProgramInfo) { 1014 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1015 auto CC = MF.getFunction().getCallingConv(); 1016 auto MD = getTargetStreamer()->getPALMetadata(); 1017 1018 MD->setEntryPoint(CC, MF.getFunction().getName()); 1019 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU); 1020 1021 // Only set AGPRs for supported devices 1022 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1023 if (STM.hasMAIInsts()) { 1024 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); 1025 } 1026 1027 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); 1028 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); 1029 if (AMDGPU::isCompute(CC)) { 1030 MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); 1031 } else { 1032 if (CurrentProgramInfo.ScratchBlocks > 0) 1033 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); 1034 } 1035 // ScratchSize is in 
bytes, 16 aligned. 1036 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); 1037 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { 1038 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 1039 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) 1040 : CurrentProgramInfo.LDSBlocks; 1041 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); 1042 MD->setSpiPsInputEna(MFI->getPSInputEnable()); 1043 MD->setSpiPsInputAddr(MFI->getPSInputAddr()); 1044 } 1045 1046 if (STM.isWave32()) 1047 MD->setWave32(MF.getFunction().getCallingConv()); 1048 } 1049 1050 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { 1051 auto *MD = getTargetStreamer()->getPALMetadata(); 1052 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1053 MD->setFunctionScratchSize(MF, MFI.getStackSize()); 1054 1055 // Set compute registers 1056 MD->setRsrc1(CallingConv::AMDGPU_CS, 1057 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS)); 1058 MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2); 1059 1060 // Set optional info 1061 MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize); 1062 MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU); 1063 MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU); 1064 } 1065 1066 // This is supposed to be log2(Size) 1067 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { 1068 switch (Size) { 1069 case 4: 1070 return AMD_ELEMENT_4_BYTES; 1071 case 8: 1072 return AMD_ELEMENT_8_BYTES; 1073 case 16: 1074 return AMD_ELEMENT_16_BYTES; 1075 default: 1076 llvm_unreachable("invalid private_element_size"); 1077 } 1078 } 1079 1080 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, 1081 const SIProgramInfo &CurrentProgramInfo, 1082 const MachineFunction &MF) const { 1083 const Function &F = MF.getFunction(); 1084 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 1085 F.getCallingConv() == 
CallingConv::SPIR_KERNEL); 1086 1087 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1088 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1089 1090 AMDGPU::initDefaultAMDKernelCodeT(Out, &STM); 1091 1092 Out.compute_pgm_resource_registers = 1093 CurrentProgramInfo.getComputePGMRSrc1() | 1094 (CurrentProgramInfo.ComputePGMRSrc2 << 32); 1095 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; 1096 1097 if (CurrentProgramInfo.DynamicCallStack) 1098 Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; 1099 1100 AMD_HSA_BITS_SET(Out.code_properties, 1101 AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, 1102 getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); 1103 1104 if (MFI->hasPrivateSegmentBuffer()) { 1105 Out.code_properties |= 1106 AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 1107 } 1108 1109 if (MFI->hasDispatchPtr()) 1110 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1111 1112 if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) 1113 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 1114 1115 if (MFI->hasKernargSegmentPtr()) 1116 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 1117 1118 if (MFI->hasDispatchID()) 1119 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 1120 1121 if (MFI->hasFlatScratchInit()) 1122 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 1123 1124 if (MFI->hasDispatchPtr()) 1125 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1126 1127 if (STM.isXNACKEnabled()) 1128 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; 1129 1130 Align MaxKernArgAlign; 1131 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); 1132 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; 1133 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; 1134 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; 1135 
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; 1136 1137 // kernarg_segment_alignment is specified as log of the alignment. 1138 // The minimum alignment is 16. 1139 // FIXME: The metadata treats the minimum as 4? 1140 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); 1141 } 1142 1143 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, 1144 const char *ExtraCode, raw_ostream &O) { 1145 // First try the generic code, which knows about modifiers like 'c' and 'n'. 1146 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) 1147 return false; 1148 1149 if (ExtraCode && ExtraCode[0]) { 1150 if (ExtraCode[1] != 0) 1151 return true; // Unknown modifier. 1152 1153 switch (ExtraCode[0]) { 1154 case 'r': 1155 break; 1156 default: 1157 return true; 1158 } 1159 } 1160 1161 // TODO: Should be able to support other operand types like globals. 1162 const MachineOperand &MO = MI->getOperand(OpNo); 1163 if (MO.isReg()) { 1164 AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, 1165 *MF->getSubtarget().getRegisterInfo()); 1166 return false; 1167 } else if (MO.isImm()) { 1168 int64_t Val = MO.getImm(); 1169 if (AMDGPU::isInlinableIntLiteral(Val)) { 1170 O << Val; 1171 } else if (isUInt<16>(Val)) { 1172 O << format("0x%" PRIx16, static_cast<uint16_t>(Val)); 1173 } else if (isUInt<32>(Val)) { 1174 O << format("0x%" PRIx32, static_cast<uint32_t>(Val)); 1175 } else { 1176 O << format("0x%" PRIx64, static_cast<uint64_t>(Val)); 1177 } 1178 return false; 1179 } 1180 return true; 1181 } 1182 1183 void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { 1184 AU.addRequired<AMDGPUResourceUsageAnalysis>(); 1185 AU.addPreserved<AMDGPUResourceUsageAnalysis>(); 1186 AsmPrinter::getAnalysisUsage(AU); 1187 } 1188 1189 void AMDGPUAsmPrinter::emitResourceUsageRemarks( 1190 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, 1191 bool isModuleEntryFunction, bool hasMAIInsts) { 1192 if (!ORE) 1193 
return; 1194 1195 const char *Name = "kernel-resource-usage"; 1196 const char *Indent = " "; 1197 1198 // If the remark is not specifically enabled, do not output to yaml 1199 LLVMContext &Ctx = MF.getFunction().getContext(); 1200 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name)) 1201 return; 1202 1203 auto EmitResourceUsageRemark = [&](StringRef RemarkName, 1204 StringRef RemarkLabel, auto Argument) { 1205 // Add an indent for every line besides the line with the kernel name. This 1206 // makes it easier to tell which resource usage go with which kernel since 1207 // the kernel name will always be displayed first. 1208 std::string LabelStr = RemarkLabel.str() + ": "; 1209 if (!RemarkName.equals("FunctionName")) 1210 LabelStr = Indent + LabelStr; 1211 1212 ORE->emit([&]() { 1213 return MachineOptimizationRemarkAnalysis(Name, RemarkName, 1214 MF.getFunction().getSubprogram(), 1215 &MF.front()) 1216 << LabelStr << ore::NV(RemarkName, Argument); 1217 }); 1218 }; 1219 1220 // FIXME: Formatting here is pretty nasty because clang does not accept 1221 // newlines from diagnostics. This forces us to emit multiple diagnostic 1222 // remarks to simulate newlines. If and when clang does accept newlines, this 1223 // formatting should be aggregated into one remark with newlines to avoid 1224 // printing multiple diagnostic location and diag opts. 
1225 EmitResourceUsageRemark("FunctionName", "Function Name", 1226 MF.getFunction().getName()); 1227 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR); 1228 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR); 1229 if (hasMAIInsts) 1230 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR); 1231 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", 1232 CurrentProgramInfo.ScratchSize); 1233 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", 1234 CurrentProgramInfo.Occupancy); 1235 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", 1236 CurrentProgramInfo.SGPRSpill); 1237 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", 1238 CurrentProgramInfo.VGPRSpill); 1239 if (isModuleEntryFunction) 1240 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]", 1241 CurrentProgramInfo.LDSSize); 1242 } 1243