//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
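  // Note that the Pref*LogAlignment fields below hold log2 values in bytes:
  // e.g. PrefFunctionLogAlignment = 4 requests 16-byte function alignment.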
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TSInfo(),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a PC-relative LDR.
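  // An extern weak symbol may legitimately resolve to address 0, so such
  // references must go through the GOT rather than direct addressing.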
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}

bool AArch64Subtarget::useAA() const { return UseAA; }