//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
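  // Properties that are not explicitly set for a core below keep the default
  // values from their member initializers in AArch64Subtarget.h.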
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 4;
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexX2:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 5;
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    break;
  case Neoverse512TVB:
    PrefFunctionLogAlignment = 4;
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 6;
    PrefLoopLogAlignment = 6;
    MaxInterleaveFactor = 4;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &TuneCPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling. 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

// Attach the Cortex-A57 FP chaining constraint to the PBQP register allocator
// when FP operation balancing is enabled for this subtarget.
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }