1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// 11 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary 12 /// code. When passed an MCAsmStreamer it prints assembly and when passed 13 /// an MCObjectStreamer it outputs binary code. 14 // 15 //===----------------------------------------------------------------------===// 16 // 17 18 #include "AMDGPUAsmPrinter.h" 19 #include "AMDGPU.h" 20 #include "AMDGPUHSAMetadataStreamer.h" 21 #include "AMDGPUResourceUsageAnalysis.h" 22 #include "GCNSubtarget.h" 23 #include "MCTargetDesc/AMDGPUInstPrinter.h" 24 #include "MCTargetDesc/AMDGPUMCExpr.h" 25 #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" 26 #include "MCTargetDesc/AMDGPUTargetStreamer.h" 27 #include "R600AsmPrinter.h" 28 #include "SIMachineFunctionInfo.h" 29 #include "TargetInfo/AMDGPUTargetInfo.h" 30 #include "Utils/AMDGPUBaseInfo.h" 31 #include "Utils/AMDKernelCodeTUtils.h" 32 #include "Utils/SIDefinesUtils.h" 33 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 34 #include "llvm/BinaryFormat/ELF.h" 35 #include "llvm/CodeGen/MachineFrameInfo.h" 36 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" 37 #include "llvm/IR/DiagnosticInfo.h" 38 #include "llvm/MC/MCAssembler.h" 39 #include "llvm/MC/MCContext.h" 40 #include "llvm/MC/MCSectionELF.h" 41 #include "llvm/MC/MCStreamer.h" 42 #include "llvm/MC/TargetRegistry.h" 43 #include "llvm/Support/AMDHSAKernelDescriptor.h" 44 #include "llvm/Target/TargetLoweringObjectFile.h" 45 #include "llvm/Target/TargetMachine.h" 46 #include "llvm/TargetParser/TargetParser.h" 47 48 using namespace llvm; 49 using namespace llvm::AMDGPU; 50 51 // This 
// should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
// and sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
71 static uint32_t getFPMode(SIModeRegisterDefaults Mode) { 72 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | 73 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | 74 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | 75 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue()); 76 } 77 78 static AsmPrinter * 79 createAMDGPUAsmPrinterPass(TargetMachine &tm, 80 std::unique_ptr<MCStreamer> &&Streamer) { 81 return new AMDGPUAsmPrinter(tm, std::move(Streamer)); 82 } 83 84 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { 85 TargetRegistry::RegisterAsmPrinter(getTheR600Target(), 86 llvm::createR600AsmPrinterPass); 87 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), 88 createAMDGPUAsmPrinterPass); 89 } 90 91 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, 92 std::unique_ptr<MCStreamer> Streamer) 93 : AsmPrinter(TM, std::move(Streamer)) { 94 assert(OutStreamer && "AsmPrinter constructed without streamer"); 95 } 96 97 StringRef AMDGPUAsmPrinter::getPassName() const { 98 return "AMDGPU Assembly Printer"; 99 } 100 101 const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const { 102 return TM.getMCSubtargetInfo(); 103 } 104 105 AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { 106 if (!OutStreamer) 107 return nullptr; 108 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); 109 } 110 111 void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) { 112 IsTargetStreamerInitialized = false; 113 } 114 115 void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { 116 IsTargetStreamerInitialized = true; 117 118 // TODO: Which one is called first, emitStartOfAsmFile or 119 // emitFunctionBodyStart? 
120 if (getTargetStreamer() && !getTargetStreamer()->getTargetID()) 121 initializeTargetID(M); 122 123 if (TM.getTargetTriple().getOS() != Triple::AMDHSA && 124 TM.getTargetTriple().getOS() != Triple::AMDPAL) 125 return; 126 127 getTargetStreamer()->EmitDirectiveAMDGCNTarget(); 128 129 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 130 getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( 131 CodeObjectVersion); 132 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); 133 } 134 135 if (TM.getTargetTriple().getOS() == Triple::AMDPAL) 136 getTargetStreamer()->getPALMetadata()->readFromIR(M); 137 } 138 139 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { 140 // Init target streamer if it has not yet happened 141 if (!IsTargetStreamerInitialized) 142 initTargetStreamer(M); 143 144 if (TM.getTargetTriple().getOS() != Triple::AMDHSA) 145 getTargetStreamer()->EmitISAVersion(); 146 147 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). 148 // Emit HSA Metadata (NT_AMD_HSA_METADATA). 149 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 150 HSAMetadataStream->end(); 151 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); 152 (void)Success; 153 assert(Success && "Malformed HSA Metadata"); 154 } 155 } 156 157 void AMDGPUAsmPrinter::emitFunctionBodyStart() { 158 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 159 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 160 const Function &F = MF->getFunction(); 161 162 // TODO: We're checking this late, would be nice to check it earlier. 163 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) { 164 report_fatal_error( 165 STM.getCPU() + " is only available on code object version 6 or better", 166 /*gen_crash_diag*/ false); 167 } 168 169 // TODO: Which one is called first, emitStartOfAsmFile or 170 // emitFunctionBodyStart? 
171 if (!getTargetStreamer()->getTargetID()) 172 initializeTargetID(*F.getParent()); 173 174 const auto &FunctionTargetID = STM.getTargetID(); 175 // Make sure function's xnack settings are compatible with module's 176 // xnack settings. 177 if (FunctionTargetID.isXnackSupported() && 178 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any && 179 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) { 180 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) + 181 "' function does not match module xnack setting"); 182 return; 183 } 184 // Make sure function's sramecc settings are compatible with module's 185 // sramecc settings. 186 if (FunctionTargetID.isSramEccSupported() && 187 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any && 188 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) { 189 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) + 190 "' function does not match module sramecc setting"); 191 return; 192 } 193 194 if (!MFI.isEntryFunction()) 195 return; 196 197 if (STM.isMesaKernel(F) && 198 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 199 F.getCallingConv() == CallingConv::SPIR_KERNEL)) { 200 AMDGPUMCKernelCodeT KernelCode; 201 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); 202 KernelCode.validate(&STM, MF->getContext()); 203 getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); 204 } 205 206 if (STM.isAmdHsaOS()) 207 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); 208 209 if (MFI.getNumKernargPreloadedSGPRs() > 0) { 210 assert(AMDGPU::hasKernargPreload(STM)); 211 getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(), 212 STM.isAmdHsaOS()); 213 } 214 } 215 216 void AMDGPUAsmPrinter::emitFunctionBodyEnd() { 217 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); 218 if (!MFI.isEntryFunction()) 219 return; 220 221 if (TM.getTargetTriple().getOS() 
!= Triple::AMDHSA) 222 return; 223 224 auto &Streamer = getTargetStreamer()->getStreamer(); 225 auto &Context = Streamer.getContext(); 226 auto &ObjectFileInfo = *Context.getObjectFileInfo(); 227 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); 228 229 Streamer.pushSection(); 230 Streamer.switchSection(&ReadOnlySection); 231 232 // CP microcode requires the kernel descriptor to be allocated on 64 byte 233 // alignment. 234 Streamer.emitValueToAlignment(Align(64), 0, 1, 0); 235 ReadOnlySection.ensureMinAlignment(Align(64)); 236 237 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); 238 239 SmallString<128> KernelName; 240 getNameWithPrefix(KernelName, &MF->getFunction()); 241 getTargetStreamer()->EmitAmdhsaKernelDescriptor( 242 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), 243 CurrentProgramInfo.NumVGPRsForWavesPerEU, 244 MCBinaryExpr::createSub( 245 CurrentProgramInfo.NumSGPRsForWavesPerEU, 246 AMDGPUMCExpr::createExtraSGPRs( 247 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, 248 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context), 249 Context), 250 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); 251 252 Streamer.popSection(); 253 } 254 255 void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { 256 Register RegNo = MI->getOperand(0).getReg(); 257 258 SmallString<128> Str; 259 raw_svector_ostream OS(Str); 260 OS << "implicit-def: " 261 << printReg(RegNo, MF->getSubtarget().getRegisterInfo()); 262 263 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL) 264 OS << " : SGPR spill to VGPR lane"; 265 266 OutStreamer->AddComment(OS.str()); 267 OutStreamer->addBlankLine(); 268 } 269 270 void AMDGPUAsmPrinter::emitFunctionEntryLabel() { 271 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 272 AsmPrinter::emitFunctionEntryLabel(); 273 return; 274 } 275 276 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); 277 const GCNSubtarget &STM = 
MF->getSubtarget<GCNSubtarget>(); 278 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) { 279 SmallString<128> SymbolName; 280 getNameWithPrefix(SymbolName, &MF->getFunction()), 281 getTargetStreamer()->EmitAMDGPUSymbolType( 282 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); 283 } 284 if (DumpCodeInstEmitter) { 285 // Disassemble function name label to text. 286 DisasmLines.push_back(MF->getName().str() + ":"); 287 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 288 HexLines.emplace_back(""); 289 } 290 291 AsmPrinter::emitFunctionEntryLabel(); 292 } 293 294 void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) { 295 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) { 296 // Write a line for the basic block label if it is not only fallthrough. 297 DisasmLines.push_back( 298 (Twine("BB") + Twine(getFunctionNumber()) 299 + "_" + Twine(MBB.getNumber()) + ":").str()); 300 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); 301 HexLines.emplace_back(""); 302 } 303 AsmPrinter::emitBasicBlockStart(MBB); 304 } 305 306 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { 307 if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 308 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) { 309 OutContext.reportError({}, 310 Twine(GV->getName()) + 311 ": unsupported initializer for address space"); 312 return; 313 } 314 315 // LDS variables aren't emitted in HSA or PAL yet. 
316 const Triple::OSType OS = TM.getTargetTriple().getOS(); 317 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) 318 return; 319 320 MCSymbol *GVSym = getSymbol(GV); 321 322 GVSym->redefineIfPossible(); 323 if (GVSym->isDefined() || GVSym->isVariable()) 324 report_fatal_error("symbol '" + Twine(GVSym->getName()) + 325 "' is already defined"); 326 327 const DataLayout &DL = GV->getDataLayout(); 328 uint64_t Size = DL.getTypeAllocSize(GV->getValueType()); 329 Align Alignment = GV->getAlign().value_or(Align(4)); 330 331 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); 332 emitLinkage(GV, GVSym); 333 auto TS = getTargetStreamer(); 334 TS->emitAMDGPULDS(GVSym, Size, Alignment); 335 return; 336 } 337 338 AsmPrinter::emitGlobalVariable(GV); 339 } 340 341 bool AMDGPUAsmPrinter::doInitialization(Module &M) { 342 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); 343 344 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { 345 switch (CodeObjectVersion) { 346 case AMDGPU::AMDHSA_COV4: 347 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>(); 348 break; 349 case AMDGPU::AMDHSA_COV5: 350 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>(); 351 break; 352 case AMDGPU::AMDHSA_COV6: 353 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>(); 354 break; 355 default: 356 report_fatal_error("Unexpected code object version"); 357 } 358 } 359 return AsmPrinter::doInitialization(M); 360 } 361 362 bool AMDGPUAsmPrinter::doFinalization(Module &M) { 363 // Pad with s_code_end to help tools and guard against instruction prefetch 364 // causing stale data in caches. Arguably this should be done by the linker, 365 // which is why this isn't done for Mesa. 
366 const MCSubtargetInfo &STI = *getGlobalSTI(); 367 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) && 368 (STI.getTargetTriple().getOS() == Triple::AMDHSA || 369 STI.getTargetTriple().getOS() == Triple::AMDPAL)) { 370 OutStreamer->switchSection(getObjFileLowering().getTextSection()); 371 getTargetStreamer()->EmitCodeEnd(STI); 372 } 373 374 return AsmPrinter::doFinalization(M); 375 } 376 377 // Print comments that apply to both callable functions and entry points. 378 void AMDGPUAsmPrinter::emitCommonFunctionComments( 379 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR, 380 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize, 381 const AMDGPUMachineFunction *MFI) { 382 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); 383 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); 384 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); 385 if (NumAGPR) { 386 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false); 387 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR), 388 false); 389 } 390 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); 391 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), 392 false); 393 } 394 395 SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) { 396 SmallString<128> Str; 397 raw_svector_ostream OSS(Str); 398 int64_t IVal; 399 if (Value->evaluateAsAbsolute(IVal)) { 400 OSS << static_cast<uint64_t>(IVal); 401 } else { 402 Value->print(OSS, MAI); 403 } 404 return Str; 405 } 406 407 void AMDGPUAsmPrinter::emitCommonFunctionComments( 408 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR, 409 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize, 410 const AMDGPUMachineFunction *MFI) { 411 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); 412 OutStreamer->emitRawComment(" NumSgprs: " + 
getMCExprStr(NumSGPR), false); 413 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false); 414 if (NumAGPR && TotalNumVGPR) { 415 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false); 416 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR), 417 false); 418 } 419 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize), 420 false); 421 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), 422 false); 423 } 424 425 const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( 426 const MachineFunction &MF) const { 427 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); 428 MCContext &Ctx = MF.getContext(); 429 uint16_t KernelCodeProperties = 0; 430 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); 431 432 if (UserSGPRInfo.hasPrivateSegmentBuffer()) { 433 KernelCodeProperties |= 434 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 435 } 436 if (UserSGPRInfo.hasDispatchPtr()) { 437 KernelCodeProperties |= 438 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 439 } 440 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { 441 KernelCodeProperties |= 442 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 443 } 444 if (UserSGPRInfo.hasKernargSegmentPtr()) { 445 KernelCodeProperties |= 446 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 447 } 448 if (UserSGPRInfo.hasDispatchID()) { 449 KernelCodeProperties |= 450 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 451 } 452 if (UserSGPRInfo.hasFlatScratchInit()) { 453 KernelCodeProperties |= 454 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 455 } 456 if (UserSGPRInfo.hasPrivateSegmentSize()) { 457 KernelCodeProperties |= 458 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; 459 } 460 if (MF.getSubtarget<GCNSubtarget>().isWave32()) { 461 KernelCodeProperties |= 462 
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32; 463 } 464 465 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be 466 // un-evaluatable at this point so it cannot be conditionally checked here. 467 // Instead, we'll directly shift the possibly unknown MCExpr into its place 468 // and bitwise-or it into KernelCodeProperties. 469 const MCExpr *KernelCodePropExpr = 470 MCConstantExpr::create(KernelCodeProperties, Ctx); 471 const MCExpr *OrValue = MCConstantExpr::create( 472 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx); 473 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack, 474 OrValue, Ctx); 475 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx); 476 477 return KernelCodePropExpr; 478 } 479 480 MCKernelDescriptor 481 AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF, 482 const SIProgramInfo &PI) const { 483 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 484 const Function &F = MF.getFunction(); 485 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 486 MCContext &Ctx = MF.getContext(); 487 488 MCKernelDescriptor KernelDescriptor; 489 490 KernelDescriptor.group_segment_fixed_size = 491 MCConstantExpr::create(PI.LDSSize, Ctx); 492 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; 493 494 Align MaxKernArgAlign; 495 KernelDescriptor.kernarg_size = MCConstantExpr::create( 496 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx); 497 498 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx); 499 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx); 500 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); 501 502 int64_t PGRM_Rsrc3 = 1; 503 bool EvaluatableRsrc3 = 504 CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3); 505 (void)PGRM_Rsrc3; 506 (void)EvaluatableRsrc3; 507 assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 || 508 static_cast<uint64_t>(PGRM_Rsrc3) 
== 0); 509 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A; 510 511 KernelDescriptor.kernarg_preload = MCConstantExpr::create( 512 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0, 513 Ctx); 514 515 return KernelDescriptor; 516 } 517 518 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { 519 // Init target streamer lazily on the first function so that previous passes 520 // can set metadata. 521 if (!IsTargetStreamerInitialized) 522 initTargetStreamer(*MF.getFunction().getParent()); 523 524 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>(); 525 CurrentProgramInfo.reset(MF); 526 527 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); 528 MCContext &Ctx = MF.getContext(); 529 530 // The starting address of all shader programs must be 256 bytes aligned. 531 // Regular functions just need the basic required instruction alignment. 532 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4)); 533 534 SetupMachineFunction(MF); 535 536 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 537 MCContext &Context = getObjFileLowering().getContext(); 538 // FIXME: This should be an explicit check for Mesa. 539 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { 540 MCSectionELF *ConfigSection = 541 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); 542 OutStreamer->switchSection(ConfigSection); 543 } 544 545 if (MFI->isModuleEntryFunction()) { 546 getSIProgramInfo(CurrentProgramInfo, MF); 547 } 548 549 if (STM.isAmdPalOS()) { 550 if (MFI->isEntryFunction()) 551 EmitPALMetadata(MF, CurrentProgramInfo); 552 else if (MFI->isModuleEntryFunction()) 553 emitPALFunctionMetadata(MF); 554 } else if (!STM.isAmdHsaOS()) { 555 EmitProgramInfoSI(MF, CurrentProgramInfo); 556 } 557 558 DumpCodeInstEmitter = nullptr; 559 if (STM.dumpCode()) { 560 // For -dumpcode, get the assembler out of the streamer. This only works 561 // with -filetype=obj. 
562 MCAssembler *Assembler = OutStreamer->getAssemblerPtr(); 563 if (Assembler) 564 DumpCodeInstEmitter = Assembler->getEmitterPtr(); 565 } 566 567 DisasmLines.clear(); 568 HexLines.clear(); 569 DisasmLineMaxLen = 0; 570 571 emitFunctionBody(); 572 573 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(), 574 STM.hasMAIInsts()); 575 576 if (isVerbose()) { 577 MCSectionELF *CommentSection = 578 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); 579 OutStreamer->switchSection(CommentSection); 580 581 if (!MFI->isEntryFunction()) { 582 OutStreamer->emitRawComment(" Function info:", false); 583 const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = 584 ResourceUsage->getResourceInfo(&MF.getFunction()); 585 emitCommonFunctionComments( 586 Info.NumVGPR, 587 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(), 588 Info.getTotalNumVGPRs(STM), 589 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), 590 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI); 591 return false; 592 } 593 594 OutStreamer->emitRawComment(" Kernel info:", false); 595 emitCommonFunctionComments( 596 CurrentProgramInfo.NumArchVGPR, 597 STM.hasMAIInsts() ? 
CurrentProgramInfo.NumAccVGPR : nullptr, 598 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, 599 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI); 600 601 OutStreamer->emitRawComment( 602 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); 603 OutStreamer->emitRawComment( 604 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); 605 OutStreamer->emitRawComment( 606 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + 607 " bytes/workgroup (compile time only)", false); 608 609 OutStreamer->emitRawComment( 610 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false); 611 612 OutStreamer->emitRawComment( 613 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false); 614 615 OutStreamer->emitRawComment( 616 " NumSGPRsForWavesPerEU: " + 617 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU), 618 false); 619 OutStreamer->emitRawComment( 620 " NumVGPRsForWavesPerEU: " + 621 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU), 622 false); 623 624 if (STM.hasGFX90AInsts()) { 625 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd( 626 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx); 627 AdjustedAccum = MCBinaryExpr::createMul( 628 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx); 629 OutStreamer->emitRawComment( 630 " AccumOffset: " + getMCExprStr(AdjustedAccum), false); 631 } 632 633 OutStreamer->emitRawComment( 634 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false); 635 636 OutStreamer->emitRawComment( 637 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); 638 639 OutStreamer->emitRawComment( 640 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + 641 getMCExprStr(CurrentProgramInfo.ScratchEnable), 642 false); 643 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + 644 Twine(CurrentProgramInfo.UserSGPR), 645 false); 646 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + 647 Twine(CurrentProgramInfo.TrapHandlerEnable), 648 false); 649 
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + 650 Twine(CurrentProgramInfo.TGIdXEnable), 651 false); 652 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + 653 Twine(CurrentProgramInfo.TGIdYEnable), 654 false); 655 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + 656 Twine(CurrentProgramInfo.TGIdZEnable), 657 false); 658 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + 659 Twine(CurrentProgramInfo.TIdIGCompCount), 660 false); 661 662 [[maybe_unused]] int64_t PGMRSrc3; 663 assert(STM.hasGFX90AInsts() || 664 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute( 665 PGMRSrc3) && 666 static_cast<uint64_t>(PGMRSrc3) == 0)); 667 if (STM.hasGFX90AInsts()) { 668 OutStreamer->emitRawComment( 669 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " + 670 getMCExprStr(MCKernelDescriptor::bits_get( 671 CurrentProgramInfo.ComputePGMRSrc3GFX90A, 672 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, 673 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)), 674 false); 675 OutStreamer->emitRawComment( 676 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " + 677 getMCExprStr(MCKernelDescriptor::bits_get( 678 CurrentProgramInfo.ComputePGMRSrc3GFX90A, 679 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, 680 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)), 681 false); 682 } 683 } 684 685 if (DumpCodeInstEmitter) { 686 687 OutStreamer->switchSection( 688 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0)); 689 690 for (size_t i = 0; i < DisasmLines.size(); ++i) { 691 std::string Comment = "\n"; 692 if (!HexLines[i].empty()) { 693 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); 694 Comment += " ; " + HexLines[i] + "\n"; 695 } 696 697 OutStreamer->emitBytes(StringRef(DisasmLines[i])); 698 OutStreamer->emitBytes(StringRef(Comment)); 699 } 700 } 701 702 return false; 703 } 704 705 // TODO: Fold this into emitFunctionBodyStart. 
706 void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { 707 // In the beginning all features are either 'Any' or 'NotSupported', 708 // depending on global target features. This will cover empty modules. 709 getTargetStreamer()->initializeTargetID(*getGlobalSTI(), 710 getGlobalSTI()->getFeatureString()); 711 712 // If module is empty, we are done. 713 if (M.empty()) 714 return; 715 716 // If module is not empty, need to find first 'Off' or 'On' feature 717 // setting per feature from functions in module. 718 for (auto &F : M) { 719 auto &TSTargetID = getTargetStreamer()->getTargetID(); 720 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) && 721 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff())) 722 break; 723 724 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F); 725 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID(); 726 if (TSTargetID->isXnackSupported()) 727 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any) 728 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting()); 729 if (TSTargetID->isSramEccSupported()) 730 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any) 731 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting()); 732 } 733 } 734 735 uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { 736 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 737 const SIInstrInfo *TII = STM.getInstrInfo(); 738 739 uint64_t CodeSize = 0; 740 741 for (const MachineBasicBlock &MBB : MF) { 742 for (const MachineInstr &MI : MBB) { 743 // TODO: CodeSize should account for multiple functions. 744 745 // TODO: Should we count size of debug info? 
746 if (MI.isDebugInstr()) 747 continue; 748 749 CodeSize += TII->getInstSizeInBytes(MI); 750 } 751 } 752 753 return CodeSize; 754 } 755 756 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 757 const MachineFunction &MF) { 758 const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info = 759 ResourceUsage->getResourceInfo(&MF.getFunction()); 760 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 761 MCContext &Ctx = MF.getContext(); 762 763 auto CreateExpr = [&Ctx](int64_t Value) { 764 return MCConstantExpr::create(Value, Ctx); 765 }; 766 767 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool { 768 int64_t Val; 769 if (Value->evaluateAsAbsolute(Val)) { 770 Res = Val; 771 return true; 772 } 773 return false; 774 }; 775 776 ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR); 777 ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR); 778 ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM)); 779 ProgInfo.AccumOffset = 780 CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1); 781 ProgInfo.TgSplit = STM.isTgSplitEnabled(); 782 ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR); 783 ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize); 784 ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC); 785 ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch); 786 ProgInfo.DynamicCallStack = 787 CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion); 788 789 const uint64_t MaxScratchPerWorkitem = 790 STM.getMaxWaveScratchSize() / STM.getWavefrontSize(); 791 uint64_t ScratchSize; 792 if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) && 793 ScratchSize > MaxScratchPerWorkitem) { 794 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize, 795 MaxScratchPerWorkitem, DS_Error); 796 MF.getFunction().getContext().diagnose(DiagStackSize); 797 } 798 799 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 800 801 // The calculations related to SGPR/VGPR blocks are 802 // duplicated in part in 
AMDGPUAsmParser::calculateGPRBlocks, and could be 803 // unified. 804 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs( 805 ProgInfo.VCCUsed, ProgInfo.FlatUsed, 806 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx); 807 808 // Check the addressable register limit before we add ExtraSGPRs. 809 if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 810 !STM.hasSGPRInitBug()) { 811 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); 812 uint64_t NumSgpr; 813 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && 814 NumSgpr > MaxAddressableNumSGPRs) { 815 // This can happen due to a compiler bug or when using inline asm. 816 LLVMContext &Ctx = MF.getFunction().getContext(); 817 DiagnosticInfoResourceLimit Diag( 818 MF.getFunction(), "addressable scalar registers", NumSgpr, 819 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit); 820 Ctx.diagnose(Diag); 821 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1); 822 } 823 } 824 825 // Account for extra SGPRs and VGPRs reserved for debugger use. 826 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx); 827 828 const Function &F = MF.getFunction(); 829 830 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave 831 // dispatch registers are function args. 832 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; 833 834 if (isShader(F.getCallingConv())) { 835 bool IsPixelShader = 836 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS(); 837 838 // Calculate the number of VGPR registers based on the SPI input registers 839 uint32_t InputEna = 0; 840 uint32_t InputAddr = 0; 841 unsigned LastEna = 0; 842 843 if (IsPixelShader) { 844 // Note for IsPixelShader: 845 // By this stage, all enabled inputs are tagged in InputAddr as well. 
846 // We will use InputAddr to determine whether the input counts against the 847 // vgpr total and only use the InputEnable to determine the last input 848 // that is relevant - if extra arguments are used, then we have to honour 849 // the InputAddr for any intermediate non-enabled inputs. 850 InputEna = MFI->getPSInputEnable(); 851 InputAddr = MFI->getPSInputAddr(); 852 853 // We only need to consider input args up to the last used arg. 854 assert((InputEna || InputAddr) && 855 "PSInputAddr and PSInputEnable should " 856 "never both be 0 for AMDGPU_PS shaders"); 857 // There are some rare circumstances where InputAddr is non-zero and 858 // InputEna can be set to 0. In this case we default to setting LastEna 859 // to 1. 860 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; 861 } 862 863 // FIXME: We should be using the number of registers determined during 864 // calling convention lowering to legalize the types. 865 const DataLayout &DL = F.getDataLayout(); 866 unsigned PSArgCount = 0; 867 unsigned IntermediateVGPR = 0; 868 for (auto &Arg : F.args()) { 869 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32; 870 if (Arg.hasAttribute(Attribute::InReg)) { 871 WaveDispatchNumSGPR += NumRegs; 872 } else { 873 // If this is a PS shader and we're processing the PS Input args (first 874 // 16 VGPR), use the InputEna and InputAddr bits to define how many 875 // VGPRs are actually used. 876 // Any extra VGPR arguments are handled as normal arguments (and 877 // contribute to the VGPR count whether they're used or not). 
878 if (IsPixelShader && PSArgCount < 16) { 879 if ((1 << PSArgCount) & InputAddr) { 880 if (PSArgCount < LastEna) 881 WaveDispatchNumVGPR += NumRegs; 882 else 883 IntermediateVGPR += NumRegs; 884 } 885 PSArgCount++; 886 } else { 887 // If there are extra arguments we have to include the allocation for 888 // the non-used (but enabled with InputAddr) input arguments 889 if (IntermediateVGPR) { 890 WaveDispatchNumVGPR += IntermediateVGPR; 891 IntermediateVGPR = 0; 892 } 893 WaveDispatchNumVGPR += NumRegs; 894 } 895 } 896 } 897 ProgInfo.NumSGPR = AMDGPUMCExpr::createMax( 898 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx); 899 900 ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax( 901 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx); 902 903 ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR( 904 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx); 905 } 906 907 // Adjust number of registers used to meet default/requested minimum/maximum 908 // number of waves per execution unit request. 909 unsigned MaxWaves = MFI->getMaxWavesPerEU(); 910 ProgInfo.NumSGPRsForWavesPerEU = 911 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul), 912 CreateExpr(STM.getMinNumSGPRs(MaxWaves))}, 913 Ctx); 914 ProgInfo.NumVGPRsForWavesPerEU = 915 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul), 916 CreateExpr(STM.getMinNumVGPRs(MaxWaves))}, 917 Ctx); 918 919 if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || 920 STM.hasSGPRInitBug()) { 921 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); 922 uint64_t NumSgpr; 923 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) && 924 NumSgpr > MaxAddressableNumSGPRs) { 925 // This can happen due to a compiler bug or when using inline asm to use 926 // the registers which are usually reserved for vcc etc. 
927 LLVMContext &Ctx = MF.getFunction().getContext(); 928 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers", 929 NumSgpr, MaxAddressableNumSGPRs, 930 DS_Error, DK_ResourceLimit); 931 Ctx.diagnose(Diag); 932 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs); 933 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs); 934 } 935 } 936 937 if (STM.hasSGPRInitBug()) { 938 ProgInfo.NumSGPR = 939 CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); 940 ProgInfo.NumSGPRsForWavesPerEU = 941 CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG); 942 } 943 944 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { 945 LLVMContext &Ctx = MF.getFunction().getContext(); 946 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs", 947 MFI->getNumUserSGPRs(), 948 STM.getMaxNumUserSGPRs(), DS_Error); 949 Ctx.diagnose(Diag); 950 } 951 952 if (MFI->getLDSSize() > 953 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) { 954 LLVMContext &Ctx = MF.getFunction().getContext(); 955 DiagnosticInfoResourceLimit Diag( 956 MF.getFunction(), "local memory", MFI->getLDSSize(), 957 STM.getAddressableLocalMemorySize(), DS_Error); 958 Ctx.diagnose(Diag); 959 } 960 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks: 961 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1 962 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR, 963 unsigned Granule) { 964 const MCExpr *OneConst = CreateExpr(1ul); 965 const MCExpr *GranuleConst = CreateExpr(Granule); 966 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx); 967 const MCExpr *AlignToGPR = 968 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx); 969 const MCExpr *DivGPR = 970 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx); 971 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx); 972 return SubGPR; 973 }; 974 975 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU, 976 
IsaInfo::getSGPREncodingGranule(&STM)); 977 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU, 978 IsaInfo::getVGPREncodingGranule(&STM)); 979 980 const SIModeRegisterDefaults Mode = MFI->getMode(); 981 982 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode 983 // register. 984 ProgInfo.FloatMode = getFPMode(Mode); 985 986 ProgInfo.IEEEMode = Mode.IEEE; 987 988 // Make clamp modifier on NaN input returns 0. 989 ProgInfo.DX10Clamp = Mode.DX10Clamp; 990 991 unsigned LDSAlignShift; 992 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { 993 // LDS is allocated in 64 dword blocks. 994 LDSAlignShift = 8; 995 } else { 996 // LDS is allocated in 128 dword blocks. 997 LDSAlignShift = 9; 998 } 999 1000 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs(); 1001 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs(); 1002 1003 ProgInfo.LDSSize = MFI->getLDSSize(); 1004 ProgInfo.LDSBlocks = 1005 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; 1006 1007 // The MCExpr equivalent of divideCeil. 1008 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) { 1009 const MCExpr *Ceil = 1010 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx); 1011 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx); 1012 }; 1013 1014 // Scratch is allocated in 64-dword or 256-dword blocks. 1015 unsigned ScratchAlignShift = 1016 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10; 1017 // We need to program the hardware with the amount of scratch memory that 1018 // is used by the entire wave. ProgInfo.ScratchSize is the amount of 1019 // scratch memory used per thread. 1020 ProgInfo.ScratchBlocks = DivideCeil( 1021 MCBinaryExpr::createMul(ProgInfo.ScratchSize, 1022 CreateExpr(STM.getWavefrontSize()), Ctx), 1023 CreateExpr(1ULL << ScratchAlignShift)); 1024 1025 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { 1026 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 
0 : 1; 1027 ProgInfo.MemOrdered = 1; 1028 } 1029 1030 // 0 = X, 1 = XY, 2 = XYZ 1031 unsigned TIDIGCompCnt = 0; 1032 if (MFI->hasWorkItemIDZ()) 1033 TIDIGCompCnt = 2; 1034 else if (MFI->hasWorkItemIDY()) 1035 TIDIGCompCnt = 1; 1036 1037 // The private segment wave byte offset is the last of the system SGPRs. We 1038 // initially assumed it was allocated, and may have used it. It shouldn't harm 1039 // anything to disable it if we know the stack isn't used here. We may still 1040 // have emitted code reading it to initialize scratch, but if that's unused 1041 // reading garbage should be OK. 1042 ProgInfo.ScratchEnable = MCBinaryExpr::createLOr( 1043 MCBinaryExpr::createGT(ProgInfo.ScratchBlocks, 1044 MCConstantExpr::create(0, Ctx), Ctx), 1045 ProgInfo.DynamicCallStack, Ctx); 1046 1047 ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); 1048 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. 1049 ProgInfo.TrapHandlerEnable = 1050 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); 1051 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); 1052 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); 1053 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); 1054 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo(); 1055 ProgInfo.TIdIGCompCount = TIDIGCompCnt; 1056 ProgInfo.EXCPEnMSB = 0; 1057 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. 1058 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks; 1059 ProgInfo.EXCPEnable = 0; 1060 1061 if (STM.hasGFX90AInsts()) { 1062 // return ((Dst & ~Mask) | (Value << Shift)) 1063 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, 1064 uint32_t Shift) { 1065 auto Shft = MCConstantExpr::create(Shift, Ctx); 1066 auto Msk = MCConstantExpr::create(Mask, Ctx); 1067 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx); 1068 Dst = MCBinaryExpr::createOr( 1069 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx); 1070 return Dst; 1071 }; 1072 1073 ProgInfo.ComputePGMRSrc3GFX90A = 1074 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset, 1075 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, 1076 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT); 1077 ProgInfo.ComputePGMRSrc3GFX90A = 1078 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit), 1079 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, 1080 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT); 1081 } 1082 1083 ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy( 1084 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU, 1085 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx); 1086 1087 const auto [MinWEU, MaxWEU] = 1088 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true); 1089 uint64_t Occupancy; 1090 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) { 1091 DiagnosticInfoOptimizationFailure Diag( 1092 F, F.getSubprogram(), 1093 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in " 1094 "'" + 1095 F.getName() + "': desired occupancy was " + Twine(MinWEU) + 1096 ", final occupancy is " + Twine(Occupancy)); 1097 F.getContext().diagnose(Diag); 1098 } 1099 } 1100 1101 static unsigned getRsrcReg(CallingConv::ID CallConv) { 1102 switch (CallConv) { 1103 default: [[fallthrough]]; 1104 case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; 1105 case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; 
1106 case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; 1107 case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; 1108 case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; 1109 case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; 1110 case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; 1111 } 1112 } 1113 1114 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, 1115 const SIProgramInfo &CurrentProgramInfo) { 1116 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1117 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1118 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); 1119 MCContext &Ctx = MF.getContext(); 1120 1121 // (((Value) & Mask) << Shift) 1122 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) { 1123 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx); 1124 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx); 1125 return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx), 1126 shft, Ctx); 1127 }; 1128 1129 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) { 1130 int64_t Val; 1131 if (Value->evaluateAsAbsolute(Val)) 1132 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size); 1133 else 1134 OutStreamer->emitValue(Value, Size); 1135 }; 1136 1137 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { 1138 OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1); 1139 1140 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx), 1141 /*Size=*/4); 1142 1143 OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); 1144 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4); 1145 1146 OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); 1147 1148 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the 1149 // appropriate generation. 
1150 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { 1151 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, 1152 /*Mask=*/0x3FFFF, /*Shift=*/12), 1153 /*Size=*/4); 1154 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { 1155 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, 1156 /*Mask=*/0x7FFF, /*Shift=*/12), 1157 /*Size=*/4); 1158 } else { 1159 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, 1160 /*Mask=*/0x1FFF, /*Shift=*/12), 1161 /*Size=*/4); 1162 } 1163 1164 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = 1165 // 0" comment but I don't see a corresponding field in the register spec. 1166 } else { 1167 OutStreamer->emitInt32(RsrcReg); 1168 1169 const MCExpr *GPRBlocks = MCBinaryExpr::createOr( 1170 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0), 1171 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6), 1172 MF.getContext()); 1173 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4); 1174 OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE); 1175 1176 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the 1177 // appropriate generation. 1178 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) { 1179 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, 1180 /*Mask=*/0x3FFFF, /*Shift=*/12), 1181 /*Size=*/4); 1182 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) { 1183 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, 1184 /*Mask=*/0x7FFF, /*Shift=*/12), 1185 /*Size=*/4); 1186 } else { 1187 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks, 1188 /*Mask=*/0x1FFF, /*Shift=*/12), 1189 /*Size=*/4); 1190 } 1191 } 1192 1193 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { 1194 OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS); 1195 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 1196 ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) 1197 : CurrentProgramInfo.LDSBlocks; 1198 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); 1199 OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA); 1200 OutStreamer->emitInt32(MFI->getPSInputEnable()); 1201 OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR); 1202 OutStreamer->emitInt32(MFI->getPSInputAddr()); 1203 } 1204 1205 OutStreamer->emitInt32(R_SPILLED_SGPRS); 1206 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs()); 1207 OutStreamer->emitInt32(R_SPILLED_VGPRS); 1208 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs()); 1209 } 1210 1211 // Helper function to add common PAL Metadata 3.0+ 1212 static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, 1213 const SIProgramInfo &CurrentProgramInfo, 1214 CallingConv::ID CC, const GCNSubtarget &ST) { 1215 if (ST.hasIEEEMode()) 1216 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode); 1217 1218 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode); 1219 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered); 1220 1221 if (AMDGPU::isCompute(CC)) { 1222 MD->setHwStage(CC, ".trap_present", 1223 (bool)CurrentProgramInfo.TrapHandlerEnable); 1224 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable); 1225 } 1226 1227 MD->setHwStage(CC, ".lds_size", 1228 (unsigned)(CurrentProgramInfo.LdsSize * 1229 getLdsDwGranularity(ST) * sizeof(uint32_t))); 1230 } 1231 1232 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type 1233 // is AMDPAL. It stores each compute/SPI register setting and other PAL 1234 // metadata items into the PALMD::Metadata, combining with any provided by the 1235 // frontend as LLVM metadata. Once all functions are written, the PAL metadata 1236 // is then written as a single block in the .note section. 
1237 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, 1238 const SIProgramInfo &CurrentProgramInfo) { 1239 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1240 auto CC = MF.getFunction().getCallingConv(); 1241 auto MD = getTargetStreamer()->getPALMetadata(); 1242 auto &Ctx = MF.getContext(); 1243 1244 MD->setEntryPoint(CC, MF.getFunction().getName()); 1245 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx); 1246 1247 // Only set AGPRs for supported devices 1248 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1249 if (STM.hasMAIInsts()) { 1250 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR); 1251 } 1252 1253 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx); 1254 if (MD->getPALMajorVersion() < 3) { 1255 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx); 1256 if (AMDGPU::isCompute(CC)) { 1257 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); 1258 } else { 1259 const MCExpr *HasScratchBlocks = 1260 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks, 1261 MCConstantExpr::create(0, Ctx), Ctx); 1262 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN); 1263 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx); 1264 } 1265 } else { 1266 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); 1267 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean, 1268 CurrentProgramInfo.ScratchEnable); 1269 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM); 1270 } 1271 1272 // ScratchSize is in bytes, 16 aligned. 1273 MD->setScratchSize( 1274 CC, 1275 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize, 1276 MCConstantExpr::create(16, Ctx), Ctx), 1277 Ctx); 1278 1279 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { 1280 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 1281 ? 
divideCeil(CurrentProgramInfo.LDSBlocks, 2) 1282 : CurrentProgramInfo.LDSBlocks; 1283 if (MD->getPALMajorVersion() < 3) { 1284 MD->setRsrc2( 1285 CC, 1286 MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx), 1287 Ctx); 1288 MD->setSpiPsInputEna(MFI->getPSInputEnable()); 1289 MD->setSpiPsInputAddr(MFI->getPSInputAddr()); 1290 } else { 1291 // Graphics registers 1292 const unsigned ExtraLdsDwGranularity = 1293 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128; 1294 MD->setGraphicsRegisters( 1295 ".ps_extra_lds_size", 1296 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t))); 1297 1298 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr 1299 static StringLiteral const PsInputFields[] = { 1300 ".persp_sample_ena", ".persp_center_ena", 1301 ".persp_centroid_ena", ".persp_pull_model_ena", 1302 ".linear_sample_ena", ".linear_center_ena", 1303 ".linear_centroid_ena", ".line_stipple_tex_ena", 1304 ".pos_x_float_ena", ".pos_y_float_ena", 1305 ".pos_z_float_ena", ".pos_w_float_ena", 1306 ".front_face_ena", ".ancillary_ena", 1307 ".sample_coverage_ena", ".pos_fixed_pt_ena"}; 1308 unsigned PSInputEna = MFI->getPSInputEnable(); 1309 unsigned PSInputAddr = MFI->getPSInputAddr(); 1310 for (auto [Idx, Field] : enumerate(PsInputFields)) { 1311 MD->setGraphicsRegisters(".spi_ps_input_ena", Field, 1312 (bool)((PSInputEna >> Idx) & 1)); 1313 MD->setGraphicsRegisters(".spi_ps_input_addr", Field, 1314 (bool)((PSInputAddr >> Idx) & 1)); 1315 } 1316 } 1317 } 1318 1319 // For version 3 and above the wave front size is already set in the metadata 1320 if (MD->getPALMajorVersion() < 3 && STM.isWave32()) 1321 MD->setWave32(MF.getFunction().getCallingConv()); 1322 } 1323 1324 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { 1325 auto *MD = getTargetStreamer()->getPALMetadata(); 1326 const MachineFrameInfo &MFI = MF.getFrameInfo(); 1327 StringRef FnName = MF.getFunction().getName(); 1328 
MD->setFunctionScratchSize(FnName, MFI.getStackSize()); 1329 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1330 MCContext &Ctx = MF.getContext(); 1331 1332 if (MD->getPALMajorVersion() < 3) { 1333 // Set compute registers 1334 MD->setRsrc1( 1335 CallingConv::AMDGPU_CS, 1336 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx); 1337 MD->setRsrc2(CallingConv::AMDGPU_CS, 1338 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx); 1339 } else { 1340 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST); 1341 } 1342 1343 // Set optional info 1344 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize); 1345 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU); 1346 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU); 1347 } 1348 1349 // This is supposed to be log2(Size) 1350 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { 1351 switch (Size) { 1352 case 4: 1353 return AMD_ELEMENT_4_BYTES; 1354 case 8: 1355 return AMD_ELEMENT_8_BYTES; 1356 case 16: 1357 return AMD_ELEMENT_16_BYTES; 1358 default: 1359 llvm_unreachable("invalid private_element_size"); 1360 } 1361 } 1362 1363 void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, 1364 const SIProgramInfo &CurrentProgramInfo, 1365 const MachineFunction &MF) const { 1366 const Function &F = MF.getFunction(); 1367 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || 1368 F.getCallingConv() == CallingConv::SPIR_KERNEL); 1369 1370 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1371 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 1372 MCContext &Ctx = MF.getContext(); 1373 1374 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false); 1375 1376 Out.compute_pgm_resource1_registers = 1377 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx); 1378 Out.compute_pgm_resource2_registers = 1379 CurrentProgramInfo.getComputePGMRSrc2(Ctx); 1380 Out.code_properties |= 
AMD_CODE_PROPERTY_IS_PTR64; 1381 1382 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack; 1383 1384 AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, 1385 getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); 1386 1387 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); 1388 if (UserSGPRInfo.hasPrivateSegmentBuffer()) { 1389 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; 1390 } 1391 1392 if (UserSGPRInfo.hasDispatchPtr()) 1393 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1394 1395 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) 1396 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; 1397 1398 if (UserSGPRInfo.hasKernargSegmentPtr()) 1399 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; 1400 1401 if (UserSGPRInfo.hasDispatchID()) 1402 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; 1403 1404 if (UserSGPRInfo.hasFlatScratchInit()) 1405 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; 1406 1407 if (UserSGPRInfo.hasPrivateSegmentSize()) 1408 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE; 1409 1410 if (UserSGPRInfo.hasDispatchPtr()) 1411 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; 1412 1413 if (STM.isXNACKEnabled()) 1414 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; 1415 1416 Align MaxKernArgAlign; 1417 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); 1418 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; 1419 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; 1420 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; 1421 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; 1422 1423 // kernarg_segment_alignment is specified as log of the alignment. 1424 // The minimum alignment is 16. 
1425 // FIXME: The metadata treats the minimum as 4? 1426 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign)); 1427 } 1428 1429 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, 1430 const char *ExtraCode, raw_ostream &O) { 1431 // First try the generic code, which knows about modifiers like 'c' and 'n'. 1432 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O)) 1433 return false; 1434 1435 if (ExtraCode && ExtraCode[0]) { 1436 if (ExtraCode[1] != 0) 1437 return true; // Unknown modifier. 1438 1439 switch (ExtraCode[0]) { 1440 case 'r': 1441 break; 1442 default: 1443 return true; 1444 } 1445 } 1446 1447 // TODO: Should be able to support other operand types like globals. 1448 const MachineOperand &MO = MI->getOperand(OpNo); 1449 if (MO.isReg()) { 1450 AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, 1451 *MF->getSubtarget().getRegisterInfo()); 1452 return false; 1453 } 1454 if (MO.isImm()) { 1455 int64_t Val = MO.getImm(); 1456 if (AMDGPU::isInlinableIntLiteral(Val)) { 1457 O << Val; 1458 } else if (isUInt<16>(Val)) { 1459 O << format("0x%" PRIx16, static_cast<uint16_t>(Val)); 1460 } else if (isUInt<32>(Val)) { 1461 O << format("0x%" PRIx32, static_cast<uint32_t>(Val)); 1462 } else { 1463 O << format("0x%" PRIx64, static_cast<uint64_t>(Val)); 1464 } 1465 return false; 1466 } 1467 return true; 1468 } 1469 1470 void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const { 1471 AU.addRequired<AMDGPUResourceUsageAnalysis>(); 1472 AU.addPreserved<AMDGPUResourceUsageAnalysis>(); 1473 AsmPrinter::getAnalysisUsage(AU); 1474 } 1475 1476 void AMDGPUAsmPrinter::emitResourceUsageRemarks( 1477 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo, 1478 bool isModuleEntryFunction, bool hasMAIInsts) { 1479 if (!ORE) 1480 return; 1481 1482 const char *Name = "kernel-resource-usage"; 1483 const char *Indent = " "; 1484 1485 // If the remark is not specifically enabled, do not output to yaml 1486 LLVMContext 
&Ctx = MF.getFunction().getContext(); 1487 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name)) 1488 return; 1489 1490 // Currently non-kernel functions have no resources to emit. 1491 if (!isEntryFunctionCC(MF.getFunction().getCallingConv())) 1492 return; 1493 1494 auto EmitResourceUsageRemark = [&](StringRef RemarkName, 1495 StringRef RemarkLabel, auto Argument) { 1496 // Add an indent for every line besides the line with the kernel name. This 1497 // makes it easier to tell which resource usage go with which kernel since 1498 // the kernel name will always be displayed first. 1499 std::string LabelStr = RemarkLabel.str() + ": "; 1500 if (RemarkName != "FunctionName") 1501 LabelStr = Indent + LabelStr; 1502 1503 ORE->emit([&]() { 1504 return MachineOptimizationRemarkAnalysis(Name, RemarkName, 1505 MF.getFunction().getSubprogram(), 1506 &MF.front()) 1507 << LabelStr << ore::NV(RemarkName, Argument); 1508 }); 1509 }; 1510 1511 // FIXME: Formatting here is pretty nasty because clang does not accept 1512 // newlines from diagnostics. This forces us to emit multiple diagnostic 1513 // remarks to simulate newlines. If and when clang does accept newlines, this 1514 // formatting should be aggregated into one remark with newlines to avoid 1515 // printing multiple diagnostic location and diag opts. 
1516 EmitResourceUsageRemark("FunctionName", "Function Name", 1517 MF.getFunction().getName()); 1518 EmitResourceUsageRemark("NumSGPR", "SGPRs", 1519 getMCExprStr(CurrentProgramInfo.NumSGPR)); 1520 EmitResourceUsageRemark("NumVGPR", "VGPRs", 1521 getMCExprStr(CurrentProgramInfo.NumArchVGPR)); 1522 if (hasMAIInsts) { 1523 EmitResourceUsageRemark("NumAGPR", "AGPRs", 1524 getMCExprStr(CurrentProgramInfo.NumAccVGPR)); 1525 } 1526 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]", 1527 getMCExprStr(CurrentProgramInfo.ScratchSize)); 1528 int64_t DynStack; 1529 bool DynStackEvaluatable = 1530 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack); 1531 StringRef DynamicStackStr = 1532 DynStackEvaluatable && DynStack ? "True" : "False"; 1533 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr); 1534 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]", 1535 getMCExprStr(CurrentProgramInfo.Occupancy)); 1536 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill", 1537 CurrentProgramInfo.SGPRSpill); 1538 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill", 1539 CurrentProgramInfo.VGPRSpill); 1540 if (isModuleEntryFunction) 1541 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]", 1542 CurrentProgramInfo.LDSSize); 1543 } 1544