//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
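  // Note that the *LogAlignment values below are log2 of the alignment in
  // bytes: e.g. PrefFunctionLogAlignment = 4 requests 16-byte function
  // alignment, and PrefLoopLogAlignment = 2 requests 4-byte loop alignment.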
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
  case NeoverseN2:
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
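    // (A MinVectorRegisterBitWidth of 128 keeps the SLP vectorizer from
    // forming 64-bit vectors on these cores.)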
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)),
      TSInfo(), TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // The MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // The same applies to the tiny code model, where we have a pc-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
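  // Functions are excluded because code addresses are not tagged.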
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // The MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via the GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the
  // other hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the call frame setup/destroy
  // pseudo instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}

bool AArch64Subtarget::useAA() const { return UseAA; }