//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMax(
    "aarch64-sve-vector-bits-max",
    cl::desc("Assume SVE vector registers are at most this big, "
             "with zero meaning no maximum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMin(
    "aarch64-sve-vector-bits-min",
    cl::desc("Assume SVE vector registers are at least this big, "
             "with zero meaning no minimum size is assumed."),
    cl::init(0), cl::Hidden);

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics.

  if (CPUString.empty())
    CPUString = "generic";

  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
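  // Note that the *LogAlignment values below are log2 of the requested byte
  // alignment, e.g. PrefFunctionLogAlignment = 4 requests 16-byte function
  // alignment.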
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA55:
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
  case NeoverseN2:
  case NeoverseV1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // The MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // The same applies to the tiny code model, where we have a PC-relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
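  // Functions are excluded here because code addresses are never tagged.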
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // The MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via the GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the other
  // hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the callframe setup/destroy
  // pseudo instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMax % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return 0;
  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMin % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return (SVEVectorBitsMin / 128) * 128;
  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
  // Prefer NEON unless larger SVE registers are available.
  return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
}