1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the AArch64 specific subclass of TargetSubtarget. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64Subtarget.h" 14 15 #include "AArch64.h" 16 #include "AArch64InstrInfo.h" 17 #include "AArch64PBQPRegAlloc.h" 18 #include "AArch64TargetMachine.h" 19 #include "GISel/AArch64CallLowering.h" 20 #include "GISel/AArch64LegalizerInfo.h" 21 #include "GISel/AArch64RegisterBankInfo.h" 22 #include "MCTargetDesc/AArch64AddressingModes.h" 23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" 24 #include "llvm/CodeGen/MachineFrameInfo.h" 25 #include "llvm/CodeGen/MachineScheduler.h" 26 #include "llvm/IR/GlobalValue.h" 27 #include "llvm/TargetParser/AArch64TargetParser.h" 28 29 using namespace llvm; 30 31 #define DEBUG_TYPE "aarch64-subtarget" 32 33 #define GET_SUBTARGETINFO_CTOR 34 #define GET_SUBTARGETINFO_TARGET_DESC 35 #include "AArch64GenSubtargetInfo.inc" 36 37 static cl::opt<bool> 38 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " 39 "converter pass"), cl::init(true), cl::Hidden); 40 41 // If OS supports TBI, use this flag to enable it. 42 static cl::opt<bool> 43 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " 44 "an address is ignored"), cl::init(false), cl::Hidden); 45 46 static cl::opt<bool> 47 UseNonLazyBind("aarch64-enable-nonlazybind", 48 cl::desc("Call nonlazybind functions via direct GOT load"), 49 cl::init(false), cl::Hidden); 50 51 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true), 52 cl::desc("Enable the use of AA during codegen.")); 53 54 static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost( 55 "aarch64-insert-extract-base-cost", 56 cl::desc("Base cost of vector insert/extract element"), cl::Hidden); 57 58 // Reserve a list of X# registers, so they are unavailable for register 59 // allocator, but can still be used as ABI requests, such as passing arguments 60 // to function call. 61 static cl::list<std::string> 62 ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical " 63 "registers, so they can't be used by register allocator. " 64 "Should only be used for testing register allocator."), 65 cl::CommaSeparated, cl::Hidden); 66 67 static cl::opt<bool> ForceStreamingCompatibleSVE( 68 "force-streaming-compatible-sve", 69 cl::desc( 70 "Force the use of streaming-compatible SVE code for all functions"), 71 cl::Hidden); 72 73 static cl::opt<AArch64PAuth::AuthCheckMethod> 74 AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method", 75 cl::Hidden, 76 cl::desc("Override the variant of check applied " 77 "to authenticated LR during tail call"), 78 cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR)); 79 80 static cl::opt<unsigned> AArch64MinimumJumpTableEntries( 81 "aarch64-min-jump-table-entries", cl::init(13), cl::Hidden, 82 cl::desc("Set minimum number of entries to use a jump table on AArch64")); 83 84 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { 85 if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) 86 return OverrideVectorInsertExtractBaseCost; 87 return VectorInsertExtractBaseCost; 88 } 89 90 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies( 91 StringRef FS, StringRef CPUString, StringRef TuneCPUString, 92 bool HasMinSize) { 93 // Determine default and user-specified characteristics 94 95 if (CPUString.empty()) 96 CPUString = "generic"; 97 98 if (TuneCPUString.empty()) 99 TuneCPUString = CPUString; 100 101 ParseSubtargetFeatures(CPUString, TuneCPUString, FS); 102 initializeProperties(HasMinSize); 103 104 return *this; 105 } 106 107 void AArch64Subtarget::initializeProperties(bool HasMinSize) { 108 // Initialize CPU specific properties. We should add a tablegen feature for 109 // this in the future so we can specify it together with the subtarget 110 // features. 111 switch (ARMProcFamily) { 112 case Others: 113 break; 114 case Carmel: 115 CacheLineSize = 64; 116 break; 117 case CortexA35: 118 case CortexA53: 119 case CortexA55: 120 PrefFunctionAlignment = Align(16); 121 PrefLoopAlignment = Align(16); 122 MaxBytesForLoopAlignment = 8; 123 break; 124 case CortexA57: 125 MaxInterleaveFactor = 4; 126 PrefFunctionAlignment = Align(16); 127 PrefLoopAlignment = Align(16); 128 MaxBytesForLoopAlignment = 8; 129 break; 130 case CortexA65: 131 PrefFunctionAlignment = Align(8); 132 break; 133 case CortexA72: 134 case CortexA73: 135 case CortexA75: 136 PrefFunctionAlignment = Align(16); 137 PrefLoopAlignment = Align(16); 138 MaxBytesForLoopAlignment = 8; 139 break; 140 case CortexA76: 141 case CortexA77: 142 case CortexA78: 143 case CortexA78C: 144 case CortexR82: 145 case CortexX1: 146 case CortexX1C: 147 PrefFunctionAlignment = Align(16); 148 PrefLoopAlignment = Align(32); 149 MaxBytesForLoopAlignment = 16; 150 break; 151 case CortexA510: 152 case CortexA520: 153 PrefFunctionAlignment = Align(16); 154 VScaleForTuning = 1; 155 PrefLoopAlignment = Align(16); 156 MaxBytesForLoopAlignment = 8; 157 break; 158 case CortexA710: 159 case CortexA715: 160 case CortexA720: 161 case CortexX2: 162 case CortexX3: 163 case CortexX4: 164 PrefFunctionAlignment = Align(16); 165 VScaleForTuning = 1; 166 PrefLoopAlignment = Align(32); 167 MaxBytesForLoopAlignment = 16; 168 break; 169 case A64FX: 170 CacheLineSize = 256; 171 PrefFunctionAlignment = Align(8); 172 PrefLoopAlignment = Align(4); 173 MaxInterleaveFactor = 4; 174 PrefetchDistance = 128; 175 MinPrefetchStride = 1024; 176 MaxPrefetchIterationsAhead = 4; 177 VScaleForTuning = 4; 178 break; 179 case AppleA7: 180 case AppleA10: 181 case AppleA11: 182 case AppleA12: 183 case AppleA13: 184 case AppleA14: 185 case AppleA15: 186 case AppleA16: 187 case AppleA17: 188 CacheLineSize = 64; 189 PrefetchDistance = 280; 190 MinPrefetchStride = 2048; 191 MaxPrefetchIterationsAhead = 3; 192 switch (ARMProcFamily) { 193 case AppleA14: 194 case AppleA15: 195 case AppleA16: 196 case AppleA17: 197 MaxInterleaveFactor = 4; 198 break; 199 default: 200 break; 201 } 202 break; 203 case ExynosM3: 204 MaxInterleaveFactor = 4; 205 MaxJumpTableSize = 20; 206 PrefFunctionAlignment = Align(32); 207 PrefLoopAlignment = Align(16); 208 break; 209 case Falkor: 210 MaxInterleaveFactor = 4; 211 // FIXME: remove this to enable 64-bit SLP if performance looks good. 212 MinVectorRegisterBitWidth = 128; 213 CacheLineSize = 128; 214 PrefetchDistance = 820; 215 MinPrefetchStride = 2048; 216 MaxPrefetchIterationsAhead = 8; 217 break; 218 case Kryo: 219 MaxInterleaveFactor = 4; 220 VectorInsertExtractBaseCost = 2; 221 CacheLineSize = 128; 222 PrefetchDistance = 740; 223 MinPrefetchStride = 1024; 224 MaxPrefetchIterationsAhead = 11; 225 // FIXME: remove this to enable 64-bit SLP if performance looks good. 226 MinVectorRegisterBitWidth = 128; 227 break; 228 case NeoverseE1: 229 PrefFunctionAlignment = Align(8); 230 break; 231 case NeoverseN1: 232 PrefFunctionAlignment = Align(16); 233 PrefLoopAlignment = Align(32); 234 MaxBytesForLoopAlignment = 16; 235 break; 236 case NeoverseN2: 237 case NeoverseV2: 238 PrefFunctionAlignment = Align(16); 239 PrefLoopAlignment = Align(32); 240 MaxBytesForLoopAlignment = 16; 241 VScaleForTuning = 1; 242 break; 243 case NeoverseV1: 244 PrefFunctionAlignment = Align(16); 245 PrefLoopAlignment = Align(32); 246 MaxBytesForLoopAlignment = 16; 247 VScaleForTuning = 2; 248 DefaultSVETFOpts = TailFoldingOpts::Simple; 249 break; 250 case Neoverse512TVB: 251 PrefFunctionAlignment = Align(16); 252 VScaleForTuning = 1; 253 MaxInterleaveFactor = 4; 254 break; 255 case Saphira: 256 MaxInterleaveFactor = 4; 257 // FIXME: remove this to enable 64-bit SLP if performance looks good. 258 MinVectorRegisterBitWidth = 128; 259 break; 260 case ThunderX2T99: 261 CacheLineSize = 64; 262 PrefFunctionAlignment = Align(8); 263 PrefLoopAlignment = Align(4); 264 MaxInterleaveFactor = 4; 265 PrefetchDistance = 128; 266 MinPrefetchStride = 1024; 267 MaxPrefetchIterationsAhead = 4; 268 // FIXME: remove this to enable 64-bit SLP if performance looks good. 269 MinVectorRegisterBitWidth = 128; 270 break; 271 case ThunderX: 272 case ThunderXT88: 273 case ThunderXT81: 274 case ThunderXT83: 275 CacheLineSize = 128; 276 PrefFunctionAlignment = Align(8); 277 PrefLoopAlignment = Align(4); 278 // FIXME: remove this to enable 64-bit SLP if performance looks good. 279 MinVectorRegisterBitWidth = 128; 280 break; 281 case TSV110: 282 CacheLineSize = 64; 283 PrefFunctionAlignment = Align(16); 284 PrefLoopAlignment = Align(4); 285 break; 286 case ThunderX3T110: 287 CacheLineSize = 64; 288 PrefFunctionAlignment = Align(16); 289 PrefLoopAlignment = Align(4); 290 MaxInterleaveFactor = 4; 291 PrefetchDistance = 128; 292 MinPrefetchStride = 1024; 293 MaxPrefetchIterationsAhead = 4; 294 // FIXME: remove this to enable 64-bit SLP if performance looks good. 295 MinVectorRegisterBitWidth = 128; 296 break; 297 case Ampere1: 298 case Ampere1A: 299 case Ampere1B: 300 CacheLineSize = 64; 301 PrefFunctionAlignment = Align(64); 302 PrefLoopAlignment = Align(64); 303 MaxInterleaveFactor = 4; 304 break; 305 } 306 307 if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize) 308 MinimumJumpTableEntries = AArch64MinimumJumpTableEntries; 309 } 310 311 AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU, 312 StringRef TuneCPU, StringRef FS, 313 const TargetMachine &TM, bool LittleEndian, 314 unsigned MinSVEVectorSizeInBitsOverride, 315 unsigned MaxSVEVectorSizeInBitsOverride, 316 bool StreamingSVEMode, 317 bool StreamingCompatibleSVEMode, 318 bool HasMinSize) 319 : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS), 320 ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()), 321 ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()), 322 CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()), 323 IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode), 324 StreamingCompatibleSVEMode(StreamingCompatibleSVEMode), 325 MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride), 326 MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT), 327 InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)), 328 TLInfo(TM, *this) { 329 if (AArch64::isX18ReservedByDefault(TT)) 330 ReserveXRegister.set(18); 331 332 CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); 333 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); 334 Legalizer.reset(new AArch64LegalizerInfo(*this)); 335 336 auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); 337 338 // FIXME: At this point, we can't rely on Subtarget having RBI. 339 // It's awkward to mix passing RBI and the Subtarget; should we pass 340 // TII/TRI as well? 341 InstSelector.reset(createAArch64InstructionSelector( 342 *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI)); 343 344 RegBankInfo.reset(RBI); 345 346 auto TRI = getRegisterInfo(); 347 StringSet<> ReservedRegNames; 348 ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end()); 349 for (unsigned i = 0; i < 29; ++i) { 350 if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i))) 351 ReserveXRegisterForRA.set(i); 352 } 353 // X30 is named LR, so we can't use TRI->getName to check X30. 354 if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR")) 355 ReserveXRegisterForRA.set(30); 356 // X29 is named FP, so we can't use TRI->getName to check X29. 357 if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP")) 358 ReserveXRegisterForRA.set(29); 359 360 AddressCheckPSV.reset(new AddressCheckPseudoSourceValue(TM)); 361 } 362 363 const CallLowering *AArch64Subtarget::getCallLowering() const { 364 return CallLoweringInfo.get(); 365 } 366 367 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const { 368 return InlineAsmLoweringInfo.get(); 369 } 370 371 InstructionSelector *AArch64Subtarget::getInstructionSelector() const { 372 return InstSelector.get(); 373 } 374 375 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const { 376 return Legalizer.get(); 377 } 378 379 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { 380 return RegBankInfo.get(); 381 } 382 383 /// Find the target operand flags that describe how a global value should be 384 /// referenced for the current subtarget. 385 unsigned 386 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, 387 const TargetMachine &TM) const { 388 // MachO large model always goes via a GOT, simply to get a single 8-byte 389 // absolute relocation on all global addresses. 390 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) 391 return AArch64II::MO_GOT; 392 393 // All globals dynamically protected by MTE must have their address tags 394 // synthesized. This is done by having the loader stash the tag in the GOT 395 // entry. Force all tagged globals (even ones with internal linkage) through 396 // the GOT. 397 if (GV->isTagged()) 398 return AArch64II::MO_GOT; 399 400 if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) { 401 if (GV->hasDLLImportStorageClass()) { 402 return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT; 403 } 404 if (getTargetTriple().isOSWindows()) 405 return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB; 406 return AArch64II::MO_GOT; 407 } 408 409 // The small code model's direct accesses use ADRP, which cannot 410 // necessarily produce the value 0 (if the code is above 4GB). 411 // Same for the tiny code model, where we have a pc relative LDR. 412 if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) && 413 GV->hasExternalWeakLinkage()) 414 return AArch64II::MO_GOT; 415 416 // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate 417 // that their nominal addresses are tagged and outside of the code model. In 418 // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the 419 // tag if necessary based on MO_TAGGED. 420 if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType())) 421 return AArch64II::MO_NC | AArch64II::MO_TAGGED; 422 423 return AArch64II::MO_NO_FLAG; 424 } 425 426 unsigned AArch64Subtarget::classifyGlobalFunctionReference( 427 const GlobalValue *GV, const TargetMachine &TM) const { 428 // MachO large model always goes via a GOT, because we don't have the 429 // relocations available to do anything else.. 430 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() && 431 !GV->hasInternalLinkage()) 432 return AArch64II::MO_GOT; 433 434 // NonLazyBind goes via GOT unless we know it's available locally. 435 auto *F = dyn_cast<Function>(GV); 436 if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) && 437 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 438 return AArch64II::MO_GOT; 439 440 if (getTargetTriple().isOSWindows()) { 441 if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy()) { 442 if (GV->hasDLLImportStorageClass()) { 443 // On Arm64EC, if we're calling a symbol from the import table 444 // directly, use MO_ARM64EC_CALLMANGLE. 445 return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT | 446 AArch64II::MO_ARM64EC_CALLMANGLE; 447 } 448 if (GV->hasExternalLinkage()) { 449 // If we're calling a symbol directly, use the mangled form in the 450 // call instruction. 451 return AArch64II::MO_ARM64EC_CALLMANGLE; 452 } 453 } 454 455 // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB. 456 return ClassifyGlobalReference(GV, TM); 457 } 458 459 return AArch64II::MO_NO_FLAG; 460 } 461 462 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 463 unsigned NumRegionInstrs) const { 464 // LNT run (at least on Cyclone) showed reasonably significant gains for 465 // bi-directional scheduling. 253.perlbmk. 466 Policy.OnlyTopDown = false; 467 Policy.OnlyBottomUp = false; 468 // Enabling or Disabling the latency heuristic is a close call: It seems to 469 // help nearly no benchmark on out-of-order architectures, on the other hand 470 // it regresses register pressure on a few benchmarking. 471 Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; 472 } 473 474 bool AArch64Subtarget::enableEarlyIfConversion() const { 475 return EnableEarlyIfConvert; 476 } 477 478 bool AArch64Subtarget::supportsAddressTopByteIgnored() const { 479 if (!UseAddressTopByteIgnored) 480 return false; 481 482 if (TargetTriple.isDriverKit()) 483 return true; 484 if (TargetTriple.isiOS()) { 485 return TargetTriple.getiOSVersion() >= VersionTuple(8); 486 } 487 488 return false; 489 } 490 491 std::unique_ptr<PBQPRAConstraint> 492 AArch64Subtarget::getCustomPBQPConstraints() const { 493 return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr; 494 } 495 496 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { 497 // We usually compute max call frame size after ISel. Do the computation now 498 // if the .mir file didn't specify it. Note that this will probably give you 499 // bogus values after PEI has eliminated the callframe setup/destroy pseudo 500 // instructions, specify explicitly if you need it to be correct. 501 MachineFrameInfo &MFI = MF.getFrameInfo(); 502 if (!MFI.isMaxCallFrameSizeComputed()) 503 MFI.computeMaxCallFrameSize(MF); 504 } 505 506 bool AArch64Subtarget::useAA() const { return UseAA; } 507 508 bool AArch64Subtarget::isStreamingCompatible() const { 509 return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE; 510 } 511 512 bool AArch64Subtarget::isNeonAvailable() const { 513 return hasNEON() && 514 (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible())); 515 } 516 517 bool AArch64Subtarget::isSVEAvailable() const { 518 return hasSVE() && 519 (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible())); 520 } 521 522 // If return address signing is enabled, tail calls are emitted as follows: 523 // 524 // ``` 525 // <authenticate LR> 526 // <check LR> 527 // TCRETURN ; the callee may sign and spill the LR in its prologue 528 // ``` 529 // 530 // LR may require explicit checking because if FEAT_FPAC is not implemented 531 // and LR was tampered with, then `<authenticate LR>` will not generate an 532 // exception on its own. Later, if the callee spills the signed LR value and 533 // neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces 534 // the higher bits of LR thus hiding the authentication failure. 535 AArch64PAuth::AuthCheckMethod 536 AArch64Subtarget::getAuthenticatedLRCheckMethod() const { 537 if (AuthenticatedLRCheckMethod.getNumOccurrences()) 538 return AuthenticatedLRCheckMethod; 539 540 // At now, use None by default because checks may introduce an unexpected 541 // performance regression or incompatibility with execute-only mappings. 542 return AArch64PAuth::AuthCheckMethod::None; 543 } 544