//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

// Reserve a list of X# registers, so they are unavailable to the register
// allocator but can still be used as required by the ABI, such as for passing
// arguments in function calls.
static cl::list<std::string>
ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                  "registers, so they can't be used by register allocator. "
" 65 "Should only be used for testing register allocator."), 66 cl::CommaSeparated, cl::Hidden); 67 68 static cl::opt<bool> 69 ForceStreamingCompatibleSVE("force-streaming-compatible-sve", 70 cl::init(false), cl::Hidden); 71 72 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { 73 if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) 74 return OverrideVectorInsertExtractBaseCost; 75 return VectorInsertExtractBaseCost; 76 } 77 78 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies( 79 StringRef FS, StringRef CPUString, StringRef TuneCPUString) { 80 // Determine default and user-specified characteristics 81 82 if (CPUString.empty()) 83 CPUString = "generic"; 84 85 if (TuneCPUString.empty()) 86 TuneCPUString = CPUString; 87 88 ParseSubtargetFeatures(CPUString, TuneCPUString, FS); 89 initializeProperties(); 90 91 return *this; 92 } 93 94 void AArch64Subtarget::initializeProperties() { 95 // Initialize CPU specific properties. We should add a tablegen feature for 96 // this in the future so we can specify it together with the subtarget 97 // features. 98 switch (ARMProcFamily) { 99 case Others: 100 break; 101 case Carmel: 102 CacheLineSize = 64; 103 break; 104 case CortexA35: 105 case CortexA53: 106 case CortexA55: 107 PrefFunctionLogAlignment = 4; 108 PrefLoopLogAlignment = 4; 109 MaxBytesForLoopAlignment = 8; 110 break; 111 case CortexA57: 112 MaxInterleaveFactor = 4; 113 PrefFunctionLogAlignment = 4; 114 PrefLoopLogAlignment = 4; 115 MaxBytesForLoopAlignment = 8; 116 break; 117 case CortexA65: 118 PrefFunctionLogAlignment = 3; 119 break; 120 case CortexA72: 121 case CortexA73: 122 case CortexA75: 123 PrefFunctionLogAlignment = 4; 124 PrefLoopLogAlignment = 4; 125 MaxBytesForLoopAlignment = 8; 126 break; 127 case CortexA76: 128 case CortexA77: 129 case CortexA78: 130 case CortexA78C: 131 case CortexR82: 132 case CortexX1: 133 case CortexX1C: 134 PrefFunctionLogAlignment = 4; 135 PrefLoopLogAlignment = 5; 136 MaxBytesForLoopAlignment = 16; 137 break; 138 case CortexA510: 139 PrefFunctionLogAlignment = 4; 140 VScaleForTuning = 1; 141 PrefLoopLogAlignment = 4; 142 MaxBytesForLoopAlignment = 8; 143 break; 144 case CortexA710: 145 case CortexA715: 146 case CortexX2: 147 case CortexX3: 148 PrefFunctionLogAlignment = 4; 149 VScaleForTuning = 1; 150 PrefLoopLogAlignment = 5; 151 MaxBytesForLoopAlignment = 16; 152 break; 153 case A64FX: 154 CacheLineSize = 256; 155 PrefFunctionLogAlignment = 3; 156 PrefLoopLogAlignment = 2; 157 MaxInterleaveFactor = 4; 158 PrefetchDistance = 128; 159 MinPrefetchStride = 1024; 160 MaxPrefetchIterationsAhead = 4; 161 VScaleForTuning = 4; 162 break; 163 case AppleA7: 164 case AppleA10: 165 case AppleA11: 166 case AppleA12: 167 case AppleA13: 168 case AppleA14: 169 case AppleA15: 170 case AppleA16: 171 CacheLineSize = 64; 172 PrefetchDistance = 280; 173 MinPrefetchStride = 2048; 174 MaxPrefetchIterationsAhead = 3; 175 switch (ARMProcFamily) { 176 case AppleA14: 177 case AppleA15: 178 case AppleA16: 179 MaxInterleaveFactor = 4; 180 break; 181 default: 182 break; 183 } 184 break; 185 case ExynosM3: 186 MaxInterleaveFactor = 4; 187 MaxJumpTableSize = 20; 188 PrefFunctionLogAlignment = 5; 189 PrefLoopLogAlignment = 4; 190 break; 191 case Falkor: 192 MaxInterleaveFactor = 4; 193 // FIXME: remove this to enable 64-bit SLP if performance looks good. 
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEModeDisabled)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      StreamingSVEModeDisabled(StreamingSVEModeDisabled),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
        GV->hasDLLImportStorageClass()) {
      // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
      // not MO_DLLIMPORTAUX.
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT runs (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it helps
  // nearly no benchmark on out-of-order architectures, while on the other hand
  // it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::forceStreamingCompatibleSVE() const {
  if (ForceStreamingCompatibleSVE) {
    assert(hasSVEorSME() && "Expected SVE to be available");
    return hasSVEorSME();
  }
  return false;
}