//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

// Reserve a list of X# registers, so they are unavailable to the register
// allocator but can still be used as the ABI requires, such as for passing
// arguments to a function call.
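// As an illustrative (not normative) example, an llc invocation such as
//   llc -mtriple=aarch64 -reserve-regs-for-regalloc=X20,X21 ...
// keeps X20 and X21 out of the allocatable set; the names are matched against
// X0-X28 plus the LR/FP aliases in the subtarget constructor below.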
" 64 "Should only be used for testing register allocator."), 65 cl::CommaSeparated, cl::Hidden); 66 67 static cl::opt<bool> ForceStreamingCompatibleSVE( 68 "force-streaming-compatible-sve", 69 cl::desc( 70 "Force the use of streaming-compatible SVE code for all functions"), 71 cl::Hidden); 72 73 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const { 74 if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) 75 return OverrideVectorInsertExtractBaseCost; 76 return VectorInsertExtractBaseCost; 77 } 78 79 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies( 80 StringRef FS, StringRef CPUString, StringRef TuneCPUString) { 81 // Determine default and user-specified characteristics 82 83 if (CPUString.empty()) 84 CPUString = "generic"; 85 86 if (TuneCPUString.empty()) 87 TuneCPUString = CPUString; 88 89 ParseSubtargetFeatures(CPUString, TuneCPUString, FS); 90 initializeProperties(); 91 92 return *this; 93 } 94 95 void AArch64Subtarget::initializeProperties() { 96 // Initialize CPU specific properties. We should add a tablegen feature for 97 // this in the future so we can specify it together with the subtarget 98 // features. 99 switch (ARMProcFamily) { 100 case Others: 101 break; 102 case Carmel: 103 CacheLineSize = 64; 104 break; 105 case CortexA35: 106 case CortexA53: 107 case CortexA55: 108 PrefFunctionAlignment = Align(16); 109 PrefLoopAlignment = Align(16); 110 MaxBytesForLoopAlignment = 8; 111 break; 112 case CortexA57: 113 MaxInterleaveFactor = 4; 114 PrefFunctionAlignment = Align(16); 115 PrefLoopAlignment = Align(16); 116 MaxBytesForLoopAlignment = 8; 117 break; 118 case CortexA65: 119 PrefFunctionAlignment = Align(8); 120 break; 121 case CortexA72: 122 case CortexA73: 123 case CortexA75: 124 PrefFunctionAlignment = Align(16); 125 PrefLoopAlignment = Align(16); 126 MaxBytesForLoopAlignment = 8; 127 break; 128 case CortexA76: 129 case CortexA77: 130 case CortexA78: 131 case CortexA78C: 132 case CortexR82: 133 case CortexX1: 134 case CortexX1C: 135 PrefFunctionAlignment = Align(16); 136 PrefLoopAlignment = Align(32); 137 MaxBytesForLoopAlignment = 16; 138 break; 139 case CortexA510: 140 PrefFunctionAlignment = Align(16); 141 VScaleForTuning = 1; 142 PrefLoopAlignment = Align(16); 143 MaxBytesForLoopAlignment = 8; 144 break; 145 case CortexA710: 146 case CortexA715: 147 case CortexX2: 148 case CortexX3: 149 PrefFunctionAlignment = Align(16); 150 VScaleForTuning = 1; 151 PrefLoopAlignment = Align(32); 152 MaxBytesForLoopAlignment = 16; 153 break; 154 case A64FX: 155 CacheLineSize = 256; 156 PrefFunctionAlignment = Align(8); 157 PrefLoopAlignment = Align(4); 158 MaxInterleaveFactor = 4; 159 PrefetchDistance = 128; 160 MinPrefetchStride = 1024; 161 MaxPrefetchIterationsAhead = 4; 162 VScaleForTuning = 4; 163 break; 164 case AppleA7: 165 case AppleA10: 166 case AppleA11: 167 case AppleA12: 168 case AppleA13: 169 case AppleA14: 170 case AppleA15: 171 case AppleA16: 172 CacheLineSize = 64; 173 PrefetchDistance = 280; 174 MinPrefetchStride = 2048; 175 MaxPrefetchIterationsAhead = 3; 176 switch (ARMProcFamily) { 177 case AppleA14: 178 case AppleA15: 179 case AppleA16: 180 MaxInterleaveFactor = 4; 181 break; 182 default: 183 break; 184 } 185 break; 186 case ExynosM3: 187 MaxInterleaveFactor = 4; 188 MaxJumpTableSize = 20; 189 PrefFunctionAlignment = Align(32); 190 PrefLoopAlignment = Align(16); 191 break; 192 case Falkor: 193 MaxInterleaveFactor = 4; 194 // FIXME: remove this to enable 64-bit SLP if performance looks good. 
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionAlignment = Align(8);
    break;
  case NeoverseN1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    DefaultSVETFOpts = TailFoldingOpts::Simple;
    break;
  case Neoverse512TVB:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(64);
    PrefLoopAlignment = Align(64);
    MaxInterleaveFactor = 4;
    break;
  }
}

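// A note on the constructor parameters below: StreamingSVEMode and
// StreamingCompatibleSVEMode describe whether the code being compiled runs in
// (or must also be able to run in) SME streaming mode, while the
// Min/MaxSVEVectorSizeInBitsOverride values bound the SVE vector length, in
// bits, that code generation may assume for this subtarget.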
AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEMode,
                                   bool StreamingCompatibleSVEMode)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode),
      StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // All globals dynamically protected by MTE must have their address tags
  // synthesized. This is done by having the loader stash the tag in the GOT
  // entry. Force all tagged globals (even ones with internal linkage) through
  // the GOT.
  if (GV->isTagged())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
        GV->hasDLLImportStorageClass()) {
      // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
      // not MO_DLLIMPORTAUX.
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling (e.g. on 253.perlbmk).
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

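// Top Byte Ignore (TBI) lets the hardware disregard bits 63:56 of a 64-bit
// virtual address during translation, so the top byte can carry metadata such
// as pointer tags. This query is deliberately conservative: it only returns
// true when the aarch64-use-tbi flag is set and the target OS (DriverKit, or
// iOS 8 and later) is known to leave the top byte of addresses intact.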
bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::isNeonAvailable() const {
  if (!hasNEON())
    return false;

  // The 'force-streaming-compatible-sve' flag overrides the streaming
  // function attributes.
  if (ForceStreamingCompatibleSVE.getNumOccurrences() > 0)
    return !ForceStreamingCompatibleSVE;

  return !isStreaming() && !isStreamingCompatible();
}