1 //===-- SIProgramInfo.cpp ----------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// 11 /// The SIProgramInfo tracks resource usage and hardware flags for kernels and 12 /// entry functions. 13 // 14 //===----------------------------------------------------------------------===// 15 // 16 17 #include "SIProgramInfo.h" 18 #include "GCNSubtarget.h" 19 #include "SIDefines.h" 20 #include "Utils/AMDGPUBaseInfo.h" 21 #include "llvm/MC/MCExpr.h" 22 23 using namespace llvm; 24 25 void SIProgramInfo::reset(const MachineFunction &MF) { 26 MCContext &Ctx = MF.getContext(); 27 28 const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx); 29 30 CodeSizeInBytes.reset(); 31 32 VGPRBlocks = ZeroExpr; 33 SGPRBlocks = ZeroExpr; 34 Priority = 0; 35 FloatMode = 0; 36 Priv = 0; 37 DX10Clamp = 0; 38 DebugMode = 0; 39 IEEEMode = 0; 40 WgpMode = 0; 41 MemOrdered = 0; 42 FwdProgress = 0; 43 RrWgMode = 0; 44 ScratchSize = ZeroExpr; 45 46 LDSBlocks = 0; 47 ScratchBlocks = ZeroExpr; 48 49 ScratchEnable = ZeroExpr; 50 UserSGPR = 0; 51 TrapHandlerEnable = 0; 52 TGIdXEnable = 0; 53 TGIdYEnable = 0; 54 TGIdZEnable = 0; 55 TGSizeEnable = 0; 56 TIdIGCompCount = 0; 57 EXCPEnMSB = 0; 58 LdsSize = 0; 59 EXCPEnable = 0; 60 61 ComputePGMRSrc3 = ZeroExpr; 62 63 NumVGPR = ZeroExpr; 64 NumArchVGPR = ZeroExpr; 65 NumAccVGPR = ZeroExpr; 66 AccumOffset = ZeroExpr; 67 TgSplit = 0; 68 NumSGPR = ZeroExpr; 69 SGPRSpill = 0; 70 VGPRSpill = 0; 71 LDSSize = 0; 72 FlatUsed = ZeroExpr; 73 74 NumSGPRsForWavesPerEU = ZeroExpr; 75 NumVGPRsForWavesPerEU = ZeroExpr; 76 Occupancy = ZeroExpr; 77 DynamicCallStack = ZeroExpr; 78 VCCUsed = ZeroExpr; 79 } 80 81 static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo, 82 const GCNSubtarget &ST) { 83 uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) | 84 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | 85 S_00B848_PRIV(ProgInfo.Priv) | 86 S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | 87 S_00B848_WGP_MODE(ProgInfo.WgpMode) | 88 S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); 89 90 if (ST.hasDX10ClampMode()) 91 Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp); 92 93 if (ST.hasIEEEMode()) 94 Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode); 95 96 // TODO: in the long run we will want to enable this unconditionally. 97 if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA) 98 Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress); 99 100 if (ST.hasRrWGMode()) 101 Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode); 102 103 return Reg; 104 } 105 106 static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo, 107 CallingConv::ID CC, const GCNSubtarget &ST) { 108 uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) | 109 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | 110 S_00B848_PRIV(ProgInfo.Priv) | 111 S_00B848_DEBUG_MODE(ProgInfo.DebugMode); 112 113 if (ST.hasDX10ClampMode()) 114 Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp); 115 116 if (ST.hasIEEEMode()) 117 Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode); 118 119 if (ST.hasRrWGMode()) 120 Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode); 121 122 switch (CC) { 123 case CallingConv::AMDGPU_PS: 124 Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered); 125 break; 126 case CallingConv::AMDGPU_VS: 127 Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered); 128 break; 129 case CallingConv::AMDGPU_GS: 130 Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) | 131 S_00B228_MEM_ORDERED(ProgInfo.MemOrdered); 132 break; 133 case CallingConv::AMDGPU_HS: 134 Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) | 135 S_00B428_MEM_ORDERED(ProgInfo.MemOrdered); 136 break; 137 default: 138 break; 139 } 140 return Reg; 141 } 142 143 static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) { 144 uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) | 145 S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) | 146 S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) | 147 S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) | 148 S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) | 149 S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) | 150 S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) | 151 S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) | 152 S_00B84C_LDS_SIZE(ProgInfo.LdsSize) | 153 S_00B84C_EXCP_EN(ProgInfo.EXCPEnable); 154 155 return Reg; 156 } 157 158 static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, 159 MCContext &Ctx) { 160 if (Mask) { 161 const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx); 162 Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx); 163 } 164 if (Shift) { 165 const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx); 166 Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx); 167 } 168 return Val; 169 } 170 171 const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST, 172 MCContext &Ctx) const { 173 uint64_t Reg = getComputePGMRSrc1Reg(*this, ST); 174 const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); 175 const MCExpr *Res = MCBinaryExpr::createOr( 176 MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx), 177 MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx); 178 return MCBinaryExpr::createOr(RegExpr, Res, Ctx); 179 } 180 181 const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC, 182 const GCNSubtarget &ST, 183 MCContext &Ctx) const { 184 if (AMDGPU::isCompute(CC)) { 185 return getComputePGMRSrc1(ST, Ctx); 186 } 187 188 uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST); 189 const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); 190 const MCExpr *Res = MCBinaryExpr::createOr( 191 MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx), 192 MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx); 193 return MCBinaryExpr::createOr(RegExpr, Res, Ctx); 194 } 195 196 const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const { 197 uint64_t Reg = getComputePGMRSrc2Reg(*this); 198 const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx); 199 return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx); 200 } 201 202 const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC, 203 MCContext &Ctx) const { 204 if (AMDGPU::isCompute(CC)) 205 return getComputePGMRSrc2(Ctx); 206 207 return MCConstantExpr::create(0, Ctx); 208 } 209 210 uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF, 211 bool IsLowerBound) { 212 if (!IsLowerBound && CodeSizeInBytes.has_value()) 213 return *CodeSizeInBytes; 214 215 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); 216 const SIInstrInfo *TII = STM.getInstrInfo(); 217 218 uint64_t CodeSize = 0; 219 220 for (const MachineBasicBlock &MBB : MF) { 221 // The amount of padding to align code can be both underestimated and 222 // overestimated. In case of inline asm used getInstSizeInBytes() will 223 // return a maximum size of a single instruction, where the real size may 224 // differ. At this point CodeSize may be already off. 225 if (!IsLowerBound) 226 CodeSize = alignTo(CodeSize, MBB.getAlignment()); 227 228 for (const MachineInstr &MI : MBB) { 229 // TODO: CodeSize should account for multiple functions. 230 231 if (MI.isMetaInstruction()) 232 continue; 233 234 // We cannot properly estimate inline asm size. It can be as small as zero 235 // if that is just a comment. 236 if (IsLowerBound && MI.isInlineAsm()) 237 continue; 238 239 CodeSize += TII->getInstSizeInBytes(MI); 240 } 241 } 242 243 CodeSizeInBytes = CodeSize; 244 return CodeSize; 245 } 246