//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
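  // Note: any property not assigned in a case below keeps the default value
  // from its member initializer in AArch64Subtarget.h.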
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA510:
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
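  // (classifyGlobalFunctionReference below applies the same rule when
  // classifying direct references to functions.)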
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT runs (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures; on the other hand,
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the callframe setup/destroy
  // pseudo instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }