//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

// Reserve a list of X# registers so that they are unavailable to the register
// allocator but can still be used where the ABI requires them, for example
// for passing arguments to a function call.
static cl::list<std::string>
ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                  "registers, so they can't be used by the register allocator. "
                  "Should only be used for testing the register allocator."),
                  cl::CommaSeparated, cl::Hidden);

static cl::opt<bool> ForceStreamingCompatibleSVE(
    "force-streaming-compatible-sve",
    cl::desc(
        "Force the use of streaming-compatible SVE code for all functions"),
    cl::Hidden);

static cl::opt<AArch64PAuth::AuthCheckMethod>
    AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
                               cl::Hidden,
                               cl::desc("Override the variant of check applied "
                                        "to authenticated LR during tail call"),
                               cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR));

static cl::opt<unsigned> AArch64MinimumJumpTableEntries(
    "aarch64-min-jump-table-entries", cl::init(13), cl::Hidden,
    cl::desc("Set minimum number of entries to use a jump table on AArch64"));

unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString,
    bool HasMinSize) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties(HasMinSize);

  return *this;
}

void AArch64Subtarget::initializeProperties(bool HasMinSize) {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionAlignment = Align(8);
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
  case CortexA520:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexA715:
  case CortexA720:
  case CortexX2:
  case CortexX3:
  case CortexX4:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
  case AppleA15:
  case AppleA16:
  case AppleA17:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    switch (ARMProcFamily) {
    case AppleA14:
    case AppleA15:
    case AppleA16:
    case AppleA17:
      MaxInterleaveFactor = 4;
      break;
    default:
      break;
    }
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionAlignment = Align(32);
    PrefLoopAlignment = Align(16);
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionAlignment = Align(8);
    break;
  case NeoverseN1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    DefaultSVETFOpts = TailFoldingOpts::Simple;
    break;
  case Neoverse512TVB:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(64);
    PrefLoopAlignment = Align(64);
    MaxInterleaveFactor = 4;
    break;
  }

  if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
    MinimumJumpTableEntries = AArch64MinimumJumpTableEntries;
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEMode,
                                   bool StreamingCompatibleSVEMode,
                                   bool HasMinSize)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode),
      StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);
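  // For example, passing -reserve-regs-for-regalloc=X7,X25 sets bits 7 and 25
  // in ReserveXRegisterForRA here, so the register allocator will not use X7
  // or X25, even though they remain usable for ABI purposes such as argument
  // passing.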

  AddressCheckPSV.reset(new AddressCheckPseudoSourceValue(TM));
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // All globals dynamically protected by MTE must have their address tags
  // synthesized. This is done by having the loader stash the tag in the GOT
  // entry. Force all tagged globals (even ones with internal linkage) through
  // the GOT.
  if (GV->isTagged())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
        GV->hasDLLImportStorageClass()) {
      // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
      // not MO_DLLIMPORTAUX.
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}
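
// For example, with -aarch64-enable-nonlazybind, a call to a non-dso-local
// function marked nonlazybind is classified as AArch64II::MO_GOT above, i.e.
// the callee's address is loaded from the GOT instead of being referenced
// directly.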

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling (e.g. 253.perlbmk).
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions, so specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::isStreamingCompatible() const {
  return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE;
}

bool AArch64Subtarget::isNeonAvailable() const {
  return hasNEON() &&
         (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

bool AArch64Subtarget::isSVEAvailable() const {
  return hasSVE() &&
         (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

// If return address signing is enabled, tail calls are emitted as follows:
//
// ```
//   <authenticate LR>
//   <check LR>
//   TCRETURN          ; the callee may sign and spill the LR in its prologue
// ```
//
// LR may require explicit checking because if FEAT_FPAC is not implemented
// and LR was tampered with, then `<authenticate LR>` will not generate an
// exception on its own. Later, if the callee spills the signed LR value and
// neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces
// the higher bits of LR, thus hiding the authentication failure.
AArch64PAuth::AuthCheckMethod
AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
  if (AuthenticatedLRCheckMethod.getNumOccurrences())
    return AuthenticatedLRCheckMethod;

  // For now, use None by default because checks may introduce an unexpected
  // performance regression or an incompatibility with execute-only mappings.
  return AArch64PAuth::AuthCheckMethod::None;
}