1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// 11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary 12 /// code. When passed an MCAsmStreamer it prints assembly and when passed 13 /// an MCObjectStreamer it outputs binary code. 14 // 15 //===----------------------------------------------------------------------===// 16 // 17 18 #include "AMDGPUAsmPrinter.h" 19 #include "AMDGPU.h" 20 #include "AMDGPUHSAMetadataStreamer.h" 21 #include "AMDGPUMCResourceInfo.h" 22 #include "AMDGPUResourceUsageAnalysis.h" 23 #include "GCNSubtarget.h" 24 #include "MCTargetDesc/AMDGPUInstPrinter.h" 25 #include "MCTargetDesc/AMDGPUMCExpr.h" 26 #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" 27 #include "MCTargetDesc/AMDGPUTargetStreamer.h" 28 #include "R600AsmPrinter.h" 29 #include "SIMachineFunctionInfo.h" 30 #include "TargetInfo/AMDGPUTargetInfo.h" 31 #include "Utils/AMDGPUBaseInfo.h" 32 #include "Utils/AMDKernelCodeTUtils.h" 33 #include "Utils/SIDefinesUtils.h" 34 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 35 #include "llvm/BinaryFormat/ELF.h" 36 #include "llvm/CodeGen/MachineFrameInfo.h" 37 #include "llvm/CodeGen/MachineModuleInfo.h" 38 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" 39 #include "llvm/IR/DiagnosticInfo.h" 40 #include "llvm/MC/MCAssembler.h" 41 #include "llvm/MC/MCContext.h" 42 #include "llvm/MC/MCSectionELF.h" 43 #include "llvm/MC/MCStreamer.h" 44 #include "llvm/MC/TargetRegistry.h" 45 #include "llvm/Support/AMDHSAKernelDescriptor.h" 46 #include "llvm/Support/Compiler.h" 47 #include "llvm/Target/TargetLoweringObjectFile.h" 48 #include "llvm/Target/TargetMachine.h" 
49 #include "llvm/TargetParser/TargetParser.h" 50 51 using namespace llvm; 52 using namespace llvm::AMDGPU; 53 54 // This should get the default rounding mode from the kernel. We just set the 55 // default here, but this could change if the OpenCL rounding mode pragmas are 56 // used. 57 // 58 // The denormal mode here should match what is reported by the OpenCL runtime 59 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but 60 // can also be override to flush with the -cl-denorms-are-zero compiler flag. 61 // 62 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double 63 // precision, and leaves single precision to flush all and does not report 64 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports 65 // CL_FP_DENORM for both. 66 // 67 // FIXME: It seems some instructions do not support single precision denormals 68 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, 69 // and sin_f32, cos_f32 on most parts). 70 71 // We want to use these instructions, and using fp32 denormals also causes 72 // instructions to run at the double precision rate for the device so it's 73 // probably best to just report no single precision denormals. 
74 static uint32_t getFPMode(SIModeRegisterDefaults Mode) { 75 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | 76 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | 77 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | 78 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); 79 } 80 81 static AsmPrinter * 82 createAMDGPUAsmPrinterPass(TargetMachine &tm, 83 std::unique_ptr<MCStreamer> &&Streamer) { 84 return new AMDGPUAsmPrinter(tm, std::move(Streamer)); 85 } 86 87 extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void 88 LLVMInitializeAMDGPUAsmPrinter() { 89 TargetRegistry::RegisterAsmPrinter(getTheR600Target(), 90 llvm::createR600AsmPrinterPass); 91 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), 92 createAMDGPUAsmPrinterPass); 93 } 94 95 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, 96 std::unique_ptr<MCStreamer> Streamer) 97 : AsmPrinter(TM, std::move(Streamer)) { 98 assert(OutStreamer && "AsmPrinter constructed without streamer"); 99 } 100 101 StringRef AMDGPUAsmPrinter::getPassName() const { 102 return "AMDGPU Assembly Printer"; 103 } 104 105 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { 106 return TM.getMCSubtargetInfo(); 107 } 108 109 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { 110 if (!OutStreamer) 111 return nullptr; 112 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); 113 } 114 115 void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { 116 IsTargetStreamerInitialized = false; 117 } 118 119 void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { 120 IsTargetStreamerInitialized = true; 121 122 // TODO: Which one is called first, emitStartOfAsmFile or 123 // emitFunctionBodyStart? 
124 if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) 125 initializeTargetID(M); 126 127 if (TM.getTargetTriple().getOS() != Triple::AMDHSA && 128 TM.getTargetTriple().getOS() != Triple::AMDPAL) 129 return; 130 131 getTargetStreamer()->EmitDirectiveAMDGCNTarget(); 132 133 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 134 getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( 135 CodeObjectVersion); 136 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); 137 } 138 139 if (TM.getTargetTriple().getOS() == Triple::AMDPAL) 140 getTargetStreamer()->getPALMetadata()->readFromIR(M); 141 } 142 143 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { 144 // Init target streamer if it has not yet happened 145 if (!IsTargetStreamerInitialized) 146 initTargetStreamer(M); 147 148 if (TM.getTargetTriple().getOS() != Triple::AMDHSA) 149 getTargetStreamer()->EmitISAVersion(); 150 151 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). 152 // Emit HSA Metadata (NT_AMD_HSA_METADATA). 153 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 154 HSAMetadataStream->end(); 155 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); 156 (void)Success; 157 assert(Success && "Malformed HSA Metadata"); 158 } 159 } 160 161 void AMDGPUAsmPrinter::emitFunctionBodyStart() { 162 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 163 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 164 const Function &F = MF->getFunction(); 165 166 // TODO: We're checking this late, would be nice to check it earlier. 167 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) { 168 reportFatalUsageError( 169 STM.getCPU() + " is only available on code object version 6 or better"); 170 } 171 172 // TODO: Which one is called first, emitStartOfAsmFile or 173 // emitFunctionBodyStart? 
174 if (!getTargetStreamer()->getTargetID()) 175 initializeTargetID(*F.getParent()); 176 177 const auto &FunctionTargetID = STM.getTargetID(); 178 // Make sure function's xnack settings are compatible with module's 179 // xnack settings. 180 if (FunctionTargetID.isXnackSupported() && 181 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && 182 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { 183 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) + 184 "' function does not match module xnack setting"); 185 return; 186 } 187 // Make sure function's sramecc settings are compatible with module's 188 // sramecc settings. 189 if (FunctionTargetID.isSramEccSupported() && 190 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && 191 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { 192 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) + 193 "' function does not match module sramecc setting"); 194 return; 195 } 196 197 if (!MFI.isEntryFunction()) 198 return; 199 200 if (STM.isMesaKernel(F) && 201 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 202 F.getCallingConv() == CallingConv::SPIR_KERNEL)) { 203 AMDGPUMCKernelCodeT KernelCode; 204 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); 205 KernelCode.validate(&STM, MF->getContext()); 206 getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); 207 } 208 209 if (STM.isAmdHsaOS()) 210 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); 211 } 212 213 void AMDGPUAsmPrinter::emitFunctionBodyEnd() { 214 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 215 if (!MFI.isEntryFunction()) 216 return; 217 218 if (TM.getTargetTriple().getOS() != Triple::AMDHSA) 219 return; 220 221 auto &Streamer = getTargetStreamer()->getStreamer(); 222 auto &Context = Streamer.getContext(); 223 auto &ObjectFileInfo = *Context.getObjectFileInfo(); 
224 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); 225 226 Streamer.pushSection(); 227 Streamer.switchSection(&ReadOnlySection); 228 229 // CP microcode requires the kernel descriptor to be allocated on 64 byte 230 // alignment. 231 Streamer.emitValueToAlignment(Align(64), 0, 1, 0); 232 ReadOnlySection.ensureMinAlignment(Align(64)); 233 234 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 235 236 SmallString<128> KernelName; 237 getNameWithPrefix(KernelName, &MF->getFunction()); 238 getTargetStreamer()->EmitAmdhsaKernelDescriptor( 239 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), 240 CurrentProgramInfo.NumVGPRsForWavesPerEU, 241 MCBinaryExpr::createSub( 242 CurrentProgramInfo.NumSGPRsForWavesPerEU, 243 AMDGPUMCExpr::createExtraSGPRs( 244 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, 245 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context), 246 Context), 247 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); 248 249 Streamer.popSection(); 250 } 251 252 void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { 253 Register RegNo = MI->getOperand(0).getReg(); 254 255 SmallString<128> Str; 256 raw_svector_ostream OS(Str); 257 OS << "implicit-def: " 258 << printReg(RegNo, MF->getSubtarget().getRegisterInfo()); 259 260 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL) 261 OS << " : SGPR spill to VGPR lane"; 262 263 OutStreamer->AddComment(OS.str()); 264 OutStreamer->addBlankLine(); 265 } 266 267 void AMDGPUAsmPrinter::emitFunctionEntryLabel() { 268 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 269 AsmPrinter::emitFunctionEntryLabel(); 270 return; 271 } 272 273 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 274 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 275 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) { 276 SmallString<128> SymbolName; 277 getNameWithPrefix(SymbolName, &MF->getFunction()), 278 
getTargetStreamer()->EmitAMDGPUSymbolType( 279 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); 280 } 281 if (DumpCodeInstEmitter) { 282 // Disassemble function name label to text. 283 DisasmLines.push_back(MF->getName().str() + ":"); 284 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 285 HexLines.emplace_back(""); 286 } 287 288 AsmPrinter::emitFunctionEntryLabel(); 289 } 290 291 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { 292 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { 293 // Write a line for the basic block label if it is not only fallthrough. 294 DisasmLines.push_back( 295 (Twine("BB") + Twine(getFunctionNumber()) 296 + "_" + Twine(MBB.getNumber()) + ":").str()); 297 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 298 HexLines.emplace_back(""); 299 } 300 AsmPrinter::emitBasicBlockStart(MBB); 301 } 302 303 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { 304 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 305 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { 306 OutContext.reportError({}, 307 Twine(GV->getName()) + 308 ": unsupported initializer for address space"); 309 return; 310 } 311 312 // LDS variables aren't emitted in HSA or PAL yet. 
313 const Triple::OSType OS = TM.getTargetTriple().getOS(); 314 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 315 return; 316 317 MCSymbol *GVSym = getSymbol(GV); 318 319 GVSym->redefineIfPossible(); 320 if (GVSym->isDefined() || GVSym->isVariable()) 321 report_fatal_error("symbol '" + Twine(GVSym->getName()) + 322 "' is already defined"); 323 324 const DataLayout &DL = GV->getDataLayout(); 325 uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); 326 Align Alignment = GV->getAlign().value_or(Align(4)); 327 328 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); 329 emitLinkage(GV, GVSym); 330 auto *TS = getTargetStreamer(); 331 TS->emitAMDGPULDS(GVSym, Size, Alignment); 332 return; 333 } 334 335 AsmPrinter::emitGlobalVariable(GV); 336 } 337 338 bool AMDGPUAsmPrinter::doInitialization(Module &M) { 339 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); 340 341 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 342 switch (CodeObjectVersion) { 343 case AMDGPU::AMDHSA_COV4: 344 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>(); 345 break; 346 case AMDGPU::AMDHSA_COV5: 347 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>(); 348 break; 349 case AMDGPU::AMDHSA_COV6: 350 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>(); 351 break; 352 default: 353 reportFatalUsageError("unsupported code object version"); 354 } 355 } 356 357 return AsmPrinter::doInitialization(M); 358 } 359 360 void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) { 361 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())) 362 return; 363 364 using RIK = MCResourceInfo::ResourceInfoKind; 365 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); 366 MCSymbol *FnSym = TM.getSymbol(&F); 367 bool IsLocal = F.hasLocalLinkage(); 368 369 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool { 370 int64_t Val; 371 if (Value->evaluateAsAbsolute(Val)) { 372 Res = 
Val; 373 return true; 374 } 375 return false; 376 }; 377 378 const uint64_t MaxScratchPerWorkitem = 379 STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); 380 MCSymbol *ScratchSizeSymbol = RI.getSymbol( 381 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal); 382 uint64_t ScratchSize; 383 if (ScratchSizeSymbol->isVariable() && 384 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) && 385 ScratchSize > MaxScratchPerWorkitem) { 386 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem, 387 DS_Error); 388 F.getContext().diagnose(DiagStackSize); 389 } 390 391 // Validate addressable scalar registers (i.e., prior to added implicit 392 // SGPRs). 393 MCSymbol *NumSGPRSymbol = 394 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal); 395 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 396 !STM.hasSGPRInitBug()) { 397 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); 398 uint64_t NumSgpr; 399 if (NumSGPRSymbol->isVariable() && 400 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) && 401 NumSgpr > MaxAddressableNumSGPRs) { 402 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers", 403 NumSgpr, MaxAddressableNumSGPRs, 404 DS_Error, DK_ResourceLimit); 405 F.getContext().diagnose(Diag); 406 return; 407 } 408 } 409 410 MCSymbol *VCCUsedSymbol = 411 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal); 412 MCSymbol *FlatUsedSymbol = RI.getSymbol( 413 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal); 414 uint64_t VCCUsed, FlatUsed, NumSgpr; 415 416 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() && 417 FlatUsedSymbol->isVariable() && 418 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) && 419 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) && 420 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) { 421 422 // Recomputes NumSgprs + implicit SGPRs but all symbols 
should now be 423 // resolvable. 424 NumSgpr += IsaInfo::getNumExtraSGPRs( 425 &STM, VCCUsed, FlatUsed, 426 getTargetStreamer()->getTargetID()->isXnackOnOrAny()); 427 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || 428 STM.hasSGPRInitBug()) { 429 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); 430 if (NumSgpr > MaxAddressableNumSGPRs) { 431 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr, 432 MaxAddressableNumSGPRs, DS_Error, 433 DK_ResourceLimit); 434 F.getContext().diagnose(Diag); 435 return; 436 } 437 } 438 439 MCSymbol *NumVgprSymbol = 440 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal); 441 MCSymbol *NumAgprSymbol = 442 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal); 443 uint64_t NumVgpr, NumAgpr; 444 445 MachineModuleInfo &MMI = 446 getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); 447 MachineFunction *MF = MMI.getMachineFunction(F); 448 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() && 449 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) && 450 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) { 451 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 452 unsigned MaxWaves = MFI.getMaxWavesPerEU(); 453 uint64_t TotalNumVgpr = 454 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr); 455 uint64_t NumVGPRsForWavesPerEU = 456 std::max({TotalNumVgpr, (uint64_t)1, 457 (uint64_t)STM.getMinNumVGPRs( 458 MaxWaves, MFI.getDynamicVGPRBlockSize())}); 459 uint64_t NumSGPRsForWavesPerEU = std::max( 460 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)}); 461 const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy( 462 STM.getOccupancyWithWorkGroupSizes(*MF).second, 463 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext), 464 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), 465 MFI.getDynamicVGPRBlockSize(), STM, OutContext); 466 uint64_t Occupancy; 467 468 const auto [MinWEU, 
MaxWEU] = AMDGPU::getIntegerPairAttribute( 469 F, "amdgpu-waves-per-eu", {0, 0}, true); 470 471 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) { 472 DiagnosticInfoOptimizationFailure Diag( 473 F, F.getSubprogram(), 474 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " 475 "'" + 476 F.getName() + "': desired occupancy was " + Twine(MinWEU) + 477 ", final occupancy is " + Twine(Occupancy)); 478 F.getContext().diagnose(Diag); 479 return; 480 } 481 } 482 } 483 } 484 485 bool AMDGPUAsmPrinter::doFinalization(Module &M) { 486 // Pad with s_code_end to help tools and guard against instruction prefetch 487 // causing stale data in caches. Arguably this should be done by the linker, 488 // which is why this isn't done for Mesa. 489 const MCSubtargetInfo &STI = *getGlobalSTI(); 490 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && 491 (STI.getTargetTriple().getOS() == Triple::AMDHSA || 492 STI.getTargetTriple().getOS() == Triple::AMDPAL)) { 493 OutStreamer->switchSection(getObjFileLowering().getTextSection()); 494 getTargetStreamer()->EmitCodeEnd(STI); 495 } 496 497 // Assign expressions which can only be resolved when all other functions are 498 // known. 499 RI.finalize(OutContext); 500 501 // Switch section and emit all GPR maximums within the processed module. 
502 OutStreamer->pushSection(); 503 MCSectionELF *MaxGPRSection = 504 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0); 505 OutStreamer->switchSection(MaxGPRSection); 506 getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext), 507 RI.getMaxAGPRSymbol(OutContext), 508 RI.getMaxSGPRSymbol(OutContext)); 509 OutStreamer->popSection(); 510 511 for (Function &F : M.functions()) 512 validateMCResourceInfo(F); 513 514 RI.reset(); 515 516 return AsmPrinter::doFinalization(M); 517 } 518 519 SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { 520 SmallString<128> Str; 521 raw_svector_ostream OSS(Str); 522 auto &Streamer = getTargetStreamer()->getStreamer(); 523 auto &Context = Streamer.getContext(); 524 const MCExpr *New = foldAMDGPUMCExpr(Value, Context); 525 printAMDGPUMCExpr(New, OSS, MAI); 526 return Str; 527 } 528 529 // Print comments that apply to both callable functions and entry points. 530 void AMDGPUAsmPrinter::emitCommonFunctionComments( 531 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, 532 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, 533 const AMDGPUMachineFunction *MFI) { 534 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); 535 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR), 536 false); 537 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false); 538 if (NumAGPR && TotalNumVGPR) { 539 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false); 540 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR), 541 false); 542 } 543 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize), 544 false); 545 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), 546 false); 547 } 548 549 const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( 550 const MachineFunction &MF) const { 551 const SIMachineFunctionInfo &MFI 
= *MF.getInfo<SIMachineFunctionInfo>(); 552 MCContext &Ctx = MF.getContext(); 553 uint16_t KernelCodeProperties = 0; 554 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); 555 556 if (UserSGPRInfo.hasPrivateSegmentBuffer()) { 557 KernelCodeProperties |= 558 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 559 } 560 if (UserSGPRInfo.hasDispatchPtr()) { 561 KernelCodeProperties |= 562 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 563 } 564 if (UserSGPRInfo.hasQueuePtr()) { 565 KernelCodeProperties |= 566 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 567 } 568 if (UserSGPRInfo.hasKernargSegmentPtr()) { 569 KernelCodeProperties |= 570 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 571 } 572 if (UserSGPRInfo.hasDispatchID()) { 573 KernelCodeProperties |= 574 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 575 } 576 if (UserSGPRInfo.hasFlatScratchInit()) { 577 KernelCodeProperties |= 578 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 579 } 580 if (UserSGPRInfo.hasPrivateSegmentSize()) { 581 KernelCodeProperties |= 582 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; 583 } 584 if (MF.getSubtarget<GCNSubtarget>().isWave32()) { 585 KernelCodeProperties |= 586 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; 587 } 588 589 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be 590 // un-evaluatable at this point so it cannot be conditionally checked here. 591 // Instead, we'll directly shift the possibly unknown MCExpr into its place 592 // and bitwise-or it into KernelCodeProperties. 
593 const MCExpr *KernelCodePropExpr = 594 MCConstantExpr::create(KernelCodeProperties, Ctx); 595 const MCExpr *OrValue = MCConstantExpr::create( 596 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx); 597 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack, 598 OrValue, Ctx); 599 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx); 600 601 return KernelCodePropExpr; 602 } 603 604 MCKernelDescriptor 605 AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, 606 const SIProgramInfo &PI) const { 607 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 608 const Function &F = MF.getFunction(); 609 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 610 MCContext &Ctx = MF.getContext(); 611 612 MCKernelDescriptor KernelDescriptor; 613 614 KernelDescriptor.group_segment_fixed_size = 615 MCConstantExpr::create(PI.LDSSize, Ctx); 616 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; 617 618 Align MaxKernArgAlign; 619 KernelDescriptor.kernarg_size = MCConstantExpr::create( 620 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx); 621 622 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx); 623 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); 624 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); 625 626 int64_t PGRM_Rsrc3 = 1; 627 bool EvaluatableRsrc3 = 628 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3); 629 (void)PGRM_Rsrc3; 630 (void)EvaluatableRsrc3; 631 assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 || 632 STM.hasGFX90AInsts() || !EvaluatableRsrc3 || 633 static_cast<uint64_t>(PGRM_Rsrc3) == 0); 634 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3; 635 636 KernelDescriptor.kernarg_preload = MCConstantExpr::create( 637 AMDGPU::hasKernargPreload(STM) ? 
Info->getNumKernargPreloadedSGPRs() : 0, 638 Ctx); 639 640 return KernelDescriptor; 641 } 642 643 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { 644 // Init target streamer lazily on the first function so that previous passes 645 // can set metadata. 646 if (!IsTargetStreamerInitialized) 647 initTargetStreamer(*MF.getFunction().getParent()); 648 649 ResourceUsage = 650 &getAnalysis<AMDGPUResourceUsageAnalysisWrapperPass>().getResourceInfo(); 651 CurrentProgramInfo.reset(MF); 652 653 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 654 MCContext &Ctx = MF.getContext(); 655 656 // The starting address of all shader programs must be 256 bytes aligned. 657 // Regular functions just need the basic required instruction alignment. 658 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); 659 660 SetupMachineFunction(MF); 661 662 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 663 MCContext &Context = getObjFileLowering().getContext(); 664 bool IsLocal = MF.getFunction().hasLocalLinkage(); 665 // FIXME: This should be an explicit check for Mesa. 666 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { 667 MCSectionELF *ConfigSection = 668 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); 669 OutStreamer->switchSection(ConfigSection); 670 } 671 672 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext); 673 674 if (MFI->isModuleEntryFunction()) { 675 getSIProgramInfo(CurrentProgramInfo, MF); 676 } 677 678 if (STM.isAmdPalOS()) { 679 if (MFI->isEntryFunction()) 680 EmitPALMetadata(MF, CurrentProgramInfo); 681 else if (MFI->isModuleEntryFunction()) 682 emitPALFunctionMetadata(MF); 683 } else if (!STM.isAmdHsaOS()) { 684 EmitProgramInfoSI(MF, CurrentProgramInfo); 685 } 686 687 DumpCodeInstEmitter = nullptr; 688 if (STM.dumpCode()) { 689 // For -dumpcode, get the assembler out of the streamer. This only works 690 // with -filetype=obj. 
691 MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); 692 if (Assembler) 693 DumpCodeInstEmitter = Assembler->getEmitterPtr(); 694 } 695 696 DisasmLines.clear(); 697 HexLines.clear(); 698 DisasmLineMaxLen = 0; 699 700 emitFunctionBody(); 701 702 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(), 703 STM.hasMAIInsts()); 704 705 { 706 using RIK = MCResourceInfo::ResourceInfoKind; 707 getTargetStreamer()->EmitMCResourceInfo( 708 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext, 709 IsLocal), 710 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext, 711 IsLocal), 712 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext, 713 IsLocal), 714 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize, 715 OutContext, IsLocal), 716 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext, 717 IsLocal), 718 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch, 719 OutContext, IsLocal), 720 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack, 721 OutContext, IsLocal), 722 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext, 723 IsLocal), 724 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall, 725 OutContext, IsLocal)); 726 } 727 728 if (isVerbose()) { 729 MCSectionELF *CommentSection = 730 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); 731 OutStreamer->switchSection(CommentSection); 732 733 if (!MFI->isEntryFunction()) { 734 using RIK = MCResourceInfo::ResourceInfoKind; 735 OutStreamer->emitRawComment(" Function info:", false); 736 737 emitCommonFunctionComments( 738 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext, 739 IsLocal) 740 ->getVariableValue(), 741 STM.hasMAIInsts() 742 ? 
RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
                         OutContext, IsLocal)
                ->getVariableValue()
            : nullptr,
        RI.createTotalNumVGPRs(MF, Ctx),
        RI.createTotalNumSGPRs(
            MF,
            MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
            Ctx),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                     OutContext, IsLocal)
            ->getVariableValue(),
        CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
    return false;
  }

    // Entry-point (kernel) path: emit human-readable resource-usage comments
    // computed in CurrentProgramInfo.
    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize,
        CurrentProgramInfo.getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);

    OutStreamer->emitRawComment(
        " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
        false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
        false);

    if (STM.hasGFX90AInsts()) {
      // AccumOffset is stored encoded as (offset / 4) - 1 (see
      // computeAccumOffset below); decode it back for the comment.
      const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
          CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
      AdjustedAccum = MCBinaryExpr::createMul(
          AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
      OutStreamer->emitRawComment(
          " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
    }

    OutStreamer->emitRawComment(
        " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
            getMCExprStr(CurrentProgramInfo.ScratchEnable),
        false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                    Twine(CurrentProgramInfo.UserSGPR),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                    Twine(CurrentProgramInfo.TGIdXEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                    Twine(CurrentProgramInfo.TGIdYEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                    Twine(CurrentProgramInfo.TGIdZEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                    Twine(CurrentProgramInfo.TIdIGCompCount),
                                false);

    // On targets without RSRC3 fields (pre-GFX10, non-gfx90a) the RSRC3
    // expression must resolve to zero.
    [[maybe_unused]] int64_t PGMRSrc3;
    assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
           STM.hasGFX90AInsts() ||
           (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
            static_cast<uint64_t>(PGMRSrc3) == 0));
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
          false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
          false);
    }
  }

  if (DumpCodeInstEmitter) {

    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        // Pad to the longest disassembly line so the hex encodings line up
        // in a single column.
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}

// TODO: Fold this into emitFunctionBodyStart.
/// Initialize the streamer's TargetID from the global subtarget, then refine
/// any 'Any' XNACK/SRAM-ECC settings using the per-function subtargets found
/// in the module.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
                                          getGlobalSTI()->getFeatureString());

  // If module is empty, we are done.
  if (M.empty())
    return;

  // If module is not empty, need to find first 'Off' or 'On' feature
  // setting per feature from functions in module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    // Stop early once both supported features have a concrete On/Off setting.
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}

// AccumOffset computed for the MCExpr equivalent of:
// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
  const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
  const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);

  // Can't be lower than 1 for subsequent alignTo.
  const MCExpr *MaximumTaken =
      AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);

  // Practically, it's computing divideCeil(MaximumTaken, 4).
const MCExpr *DivCeil = MCBinaryExpr::createDiv(
      AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
      Ctx);

  return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
}

/// Populate \p ProgInfo with the register counts, scratch/LDS usage and
/// COMPUTE_PGM_RSRC* field values for \p MF. Most quantities are built as
/// MCExprs over the per-function resource-info symbols so they can be
/// resolved late (after all functions have been emitted).
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  bool IsLocal = MF.getFunction().hasLocalLinkage();
  MCContext &Ctx = MF.getContext();

  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  // Returns true (and sets Res) only if the expression folds to a constant.
  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  // Reference to the per-function resource-info symbol of the given kind.
  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym =
        RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
    return MCSymbolRefExpr::create(Sym, Ctx);
  };

  using RIK = MCResourceInfo::ResourceInfoKind;
  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);

  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

  if (isShader(F.getCallingConv())) {
    bool IsPixelShader =
        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

    // Calculate the number of VGPR registers based on the SPI input registers
    uint32_t InputEna = 0;
    uint32_t InputAddr = 0;
    unsigned LastEna = 0;

    if (IsPixelShader) {
      // Note for IsPixelShader:
      // By this stage, all enabled inputs are tagged in InputAddr as well.
      // We will use InputAddr to determine whether the input counts against the
      // vgpr total and only use the InputEnable to determine the last input
      // that is relevant - if extra arguments are used, then we have to honour
      // the InputAddr for any intermediate non-enabled inputs.
      InputEna = MFI->getPSInputEnable();
      InputAddr = MFI->getPSInputAddr();

      // We only need to consider input args up to the last used arg.
      assert((InputEna || InputAddr) &&
             "PSInputAddr and PSInputEnable should "
             "never both be 0 for AMDGPU_PS shaders");
      // There are some rare circumstances where InputAddr is non-zero and
      // InputEna can be set to 0. In this case we default to setting LastEna
      // to 1.
      LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
    }

    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getDataLayout();
    unsigned PSArgCount = 0;
    unsigned IntermediateVGPR = 0;
    for (auto &Arg : F.args()) {
      // Each argument occupies ceil(bits / 32) 32-bit registers.
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg)) {
        WaveDispatchNumSGPR += NumRegs;
      } else {
        // If this is a PS shader and we're processing the PS Input args (first
        // 16 VGPR), use the InputEna and InputAddr bits to define how many
        // VGPRs are actually used.
        // Any extra VGPR arguments are handled as normal arguments (and
        // contribute to the VGPR count whether they're used or not).
        if (IsPixelShader && PSArgCount < 16) {
          if ((1 << PSArgCount) & InputAddr) {
            if (PSArgCount < LastEna)
              WaveDispatchNumVGPR += NumRegs;
            else
              IntermediateVGPR += NumRegs;
          }
          PSArgCount++;
        } else {
          // If there are extra arguments we have to include the allocation for
          // the non-used (but enabled with InputAddr) input arguments
          if (IntermediateVGPR) {
            WaveDispatchNumVGPR += IntermediateVGPR;
            IntermediateVGPR = 0;
          }
          WaveDispatchNumVGPR += NumRegs;
        }
      }
    }
    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);

    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  } else if (isKernel(F.getCallingConv()) &&
             MFI->getNumKernargPreloadedSGPRs()) {
    // Consider cases where the total number of UserSGPRs with trailing
    // allocated preload SGPRs, is greater than the number of explicitly
    // referenced SGPRs.
    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
    ProgInfo.NumSGPR =
        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(
                                   MaxWaves, MFI->getDynamicVGPRBlockSize()))},
                              Ctx);

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() >
      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }
  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };

  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
                                        IsaInfo::getSGPREncodingGranule(&STM));
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(&STM));

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
    // LDS is allocated in 320 dword blocks.
    LDSAlignShift = 11;
  } else if (STM.getFeatureBits().test(
                 FeatureAddressableLocalMemorySize65536)) {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  } else {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
    ProgInfo.FwdProgress = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
                             MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable =
      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  // return ((Dst & ~Mask) | (Value << Shift))
  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                        uint32_t Shift) {
    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
    Dst = MCBinaryExpr::createOr(Dst, MCBinaryExpr::createShl(Value, Shft, Ctx),
                                 Ctx);
    return Dst;
  };

  if (STM.hasGFX90AInsts()) {
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
      ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
      MFI->getDynamicVGPRBlockSize(), STM, Ctx);

  // Warn (as an optimization failure) if the resolved occupancy falls below
  // the minimum requested via the 'amdgpu-waves-per-eu' attribute.
  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }

  if (isGFX11Plus(STM)) {
    uint32_t CodeSizeInBytes = (uint32_t)std::min(
        ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
        (uint64_t)std::numeric_limits<uint32_t>::max());
    // INST_PREF_SIZE is expressed in 128-byte granules, saturated to the
    // field width.
    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
    uint32_t Field, Shift, Width;
    if (isGFX11(STM)) {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
    } else {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
    }
    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
                                       CreateExpr(InstPrefSize), Field, Shift);
  }
}

/// Map a calling convention to the SPI/COMPUTE PGM_RSRC1 register address
/// used when emitting program info directly (non-HSA paths).
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: [[fallthrough]];
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return
R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

/// Emit program info as a stream of (register address, value) 32-bit pairs.
/// Values that fold to constants are emitted as integers; otherwise the MCExpr
/// itself is emitted for late resolution.
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
  MCContext &Ctx = MF.getContext();

  // (((Value) & Mask) << Shift)
  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
                                   shft, Ctx);
  };

  // Emit a constant when the expression resolves, the raw expression if not.
  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val))
      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
    else
      OutStreamer->emitValue(Value, Size);
  };

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
                       /*Size=*/4);

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);

    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
        MF.getContext());
    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    // GFX11+ counts EXTRA_LDS_SIZE in 256-dword granules instead of 128.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}

// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST,
                                  unsigned DynamicVGPRBlockSize) {
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);

    if (DynamicVGPRBlockSize != 0)
      MD->setComputeRegisters(".dynamic_vgpr_en", true);
  }

  // .lds_size is reported in bytes: blocks * dwords-per-block * 4.
  MD->setHwStage(CC, ".lds_size",
                 (unsigned)(CurrentProgramInfo.LdsSize *
                            getLdsDwGranularity(ST) * sizeof(uint32_t)));
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto *MD = getTargetStreamer()->getPALMetadata();
  auto &Ctx = MF.getContext();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);

  // For targets that support dynamic VGPRs, set the number of saved dynamic
  // VGPRs (if any) in the PAL metadata.
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (MFI->isDynamicVGPREnabled() &&
      MFI->getScratchReservedForDynamicVGPRs() > 0)
    MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
                   MFI->getScratchReservedForDynamicVGPRs() / 4);

  // Only set AGPRs for supported devices
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
  if (MD->getPALMajorVersion() < 3) {
    // Pre-3.0 PAL metadata: registers are stored as raw RSRC1/RSRC2 values.
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
    } else {
      const MCExpr *HasScratchBlocks =
          MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
                                 MCConstantExpr::create(0, Ctx), Ctx);
      auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
      MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
    }
  } else {
    // PAL metadata 3.0+: individual named fields.
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
                          MFI->getDynamicVGPRBlockSize());
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(
      CC,
      AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
                                  MCConstantExpr::create(16, Ctx), Ctx),
      Ctx);

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // GFX11+ counts extra LDS in 256-dword granules instead of 128.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(
          CC,
          MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
          Ctx);
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
      // Field names are ordered to match the bit positions of
      // SPI_PS_INPUT_ENA/ADDR, so bit Idx maps to PsInputFields[Idx].
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena",    ".persp_center_ena",
          ".persp_centroid_ena",  ".persp_pull_model_ena",
          ".linear_sample_ena",   ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena",     ".pos_y_float_ena",
          ".pos_z_float_ena",     ".pos_w_float_ena",
          ".front_face_ena",      ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above the wave front size is already set in the metadata
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}

/// Record per-function PAL metadata (scratch/LDS/register usage) for
/// non-entry functions, keyed by function name.
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CallingConv::AMDGPU_CS,
        CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    EmitPALMetadataCommon(
        MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}

// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

/// Fill in the amd_kernel_code_t object (\p Out) for a kernel from the
/// computed program info and the kernel's user-SGPR configuration.
void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const
GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1583 MCContext &Ctx = MF.getContext(); 1584 1585 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false); 1586 1587 Out.compute_pgm_resource1_registers = 1588 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx); 1589 Out.compute_pgm_resource2_registers = 1590 CurrentProgramInfo.getComputePGMRSrc2(Ctx); 1591 Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; 1592 1593 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack; 1594 1595 AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, 1596 getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); 1597 1598 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); 1599 if (UserSGPRInfo.hasPrivateSegmentBuffer()) { 1600 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 1601 } 1602 1603 if (UserSGPRInfo.hasDispatchPtr()) 1604 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1605 1606 if (UserSGPRInfo.hasQueuePtr()) 1607 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 1608 1609 if (UserSGPRInfo.hasKernargSegmentPtr()) 1610 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 1611 1612 if (UserSGPRInfo.hasDispatchID()) 1613 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 1614 1615 if (UserSGPRInfo.hasFlatScratchInit()) 1616 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 1617 1618 if (UserSGPRInfo.hasPrivateSegmentSize()) 1619 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; 1620 1621 if (STM.isXNACKEnabled()) 1622 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; 1623 1624 Align MaxKernArgAlign; 1625 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); 1626 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; 1627 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; 1628 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; 1629 
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}

/// Print one inline-asm operand for AMDGPU.
///
/// Returns false when the operand was printed (the AsmPrinter convention for
/// success), and true for an unknown/unsupported modifier or an operand kind
/// that is not handled here (only registers and immediates are supported).
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    // Only single-character modifiers are recognized.
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      // 'r' (register) is accepted; fall through to the printing code below.
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }
  if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      // Inline-constant range is printed in decimal.
      O << Val;
    } else if (isUInt<16>(Val)) {
      // Other immediates are printed in hex, using the narrowest of
      // 16/32/64 bits that holds the (unsigned) value.
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}

/// Declare the analyses this printer consumes. Both the resource-usage
/// analysis and the MachineModuleInfo wrapper are required and explicitly
/// preserved; the base class adds its own requirements.
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addPreserved<AMDGPUResourceUsageAnalysisWrapperPass>();
  AU.addRequired<MachineModuleInfoWrapperPass>();
  AU.addPreserved<MachineModuleInfoWrapperPass>();
  AsmPrinter::getAnalysisUsage(AU);
}

void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const
MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  // No remark emitter available — nothing to report.
  if (!ORE)
    return;

  // Remark pass name; used as the key checked by isAnalysisRemarkEnabled
  // below, so these remarks can be enabled/filtered individually.
  const char *Name = "kernel-resource-usage";
  // NOTE(review): the exact width of this indent string is ambiguous in this
  // extraction (whitespace was collapsed) — confirm against upstream.
  const char *Indent = " ";

  // If the remark is not specifically enabled, do not output to yaml
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
    return;

  // Currently non-kernel functions have no resources to emit.
  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
    return;

  // Helper: emit one "Label: value" pair as its own analysis remark,
  // attached to the function's entry block.
  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage go with which kernel since
    // the kernel name will always be displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (RemarkName != "FunctionName")
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               MF.getFunction().getSubprogram(),
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
// One remark per resource line; the kernel name comes first (unindented)
  // so subsequent indented lines visually group under it.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
                          getMCExprStr(CurrentProgramInfo.NumSGPR));
  EmitResourceUsageRemark("NumVGPR", "VGPRs",
                          getMCExprStr(CurrentProgramInfo.NumArchVGPR));
  // AGPRs only exist on subtargets with MAI instructions.
  if (hasMAIInsts) {
    EmitResourceUsageRemark("NumAGPR", "AGPRs",
                            getMCExprStr(CurrentProgramInfo.NumAccVGPR));
  }
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          getMCExprStr(CurrentProgramInfo.ScratchSize));
  // DynamicCallStack is an MCExpr; report "True" only when it folds to a
  // non-zero constant (unresolvable expressions are reported as "False").
  int64_t DynStack;
  bool DynStackEvaluatable =
      CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
  StringRef DynamicStackStr =
      DynStackEvaluatable && DynStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          getMCExprStr(CurrentProgramInfo.Occupancy));
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  // LDS size is only meaningful for module entry functions.
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}

// Pass identification, replacement for typeid.
char AMDGPUAsmPrinter::ID = 0;

// Register the pass with the legacy pass manager.
INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
                "AMDGPU Assembly Printer", false, false)