//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);
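// Illustrative (hypothetical) invocation only, assuming a standalone llc run;
// the flag is off by default and must be passed explicitly, e.g.:
//   llc -mtriple=aarch64 -aarch64-use-tbi input.ll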

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);
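// Illustrative (hypothetical) invocation only: a value given here overrides
// the subtarget's VectorInsertExtractBaseCost default (see
// getVectorInsertExtractBaseCost below), e.g.:
//   llc -mtriple=aarch64 -aarch64-insert-extract-base-cost=4 input.ll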

// Reserve a list of X# registers, so they are unavailable to the register
// allocator but can still be used to satisfy ABI requirements, such as
// passing arguments in a function call.
static cl::list<std::string>
ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                  "registers, so they can't be used by register allocator. "
                  "Should only be used for testing register allocator."),
                  cl::CommaSeparated, cl::Hidden);
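// Illustrative (hypothetical) invocation only; the list is comma separated
// and is matched against the register names checked in the constructor
// below, e.g.:
//   llc -mtriple=aarch64 -reserve-regs-for-regalloc=X20,X21 input.ll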

static cl::opt<bool> ForceStreamingCompatibleSVE(
    "force-streaming-compatible-sve",
    cl::desc(
        "Force the use of streaming-compatible SVE code for all functions"),
    cl::Hidden);

static cl::opt<AArch64PAuth::AuthCheckMethod>
    AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
                               cl::Hidden,
                               cl::desc("Override the variant of check applied "
                                        "to authenticated LR during tail call"),
                               cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR));

static cl::opt<unsigned> AArch64MinimumJumpTableEntries(
    "aarch64-min-jump-table-entries", cl::init(13), cl::Hidden,
    cl::desc("Set minimum number of entries to use a jump table on AArch64"));

unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString,
    bool HasMinSize) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties(HasMinSize);

  return *this;
}

void AArch64Subtarget::initializeProperties(bool HasMinSize) {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionAlignment = Align(8);
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78C:
  case CortexR82:
  case CortexX1:
  case CortexX1C:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA510:
  case CortexA520:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexA715:
  case CortexA720:
  case CortexX2:
  case CortexX3:
  case CortexX4:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
  case AppleA15:
  case AppleA16:
  case AppleA17:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    switch (ARMProcFamily) {
    case AppleA14:
    case AppleA15:
    case AppleA16:
    case AppleA17:
      MaxInterleaveFactor = 4;
      break;
    default:
      break;
    }
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionAlignment = Align(32);
    PrefLoopAlignment = Align(16);
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionAlignment = Align(8);
    break;
  case NeoverseN1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseN2:
  case NeoverseV2:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    DefaultSVETFOpts = TailFoldingOpts::Simple;
    break;
  case Neoverse512TVB:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(64);
    PrefLoopAlignment = Align(64);
    MaxInterleaveFactor = 4;
    break;
  }

  if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
    MinimumJumpTableEntries = AArch64MinimumJumpTableEntries;
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool StreamingSVEMode,
                                   bool StreamingCompatibleSVEMode,
                                   bool HasMinSize)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode),
      StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
      TLInfo(TM, *this) {
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames;
  ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);

  AddressCheckPSV.reset(new AddressCheckPseudoSourceValue(TM));
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // All globals dynamically protected by MTE must have their address tags
  // synthesized. This is done by having the loader stash the tag in the GOT
  // entry. Force all tagged globals (even ones with internal linkage) through
  // the GOT.
  if (GV->isTagged())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass()) {
      if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
        GV->hasDLLImportStorageClass()) {
      // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
      // not MO_DLLIMPORTAUX.
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help almost no benchmark on out-of-order architectures, while on the
  // other hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::isStreamingCompatible() const {
  return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE;
}

bool AArch64Subtarget::isNeonAvailable() const {
  return hasNEON() &&
         (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

bool AArch64Subtarget::isSVEAvailable() const {
  return hasSVE() &&
         (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

// If return address signing is enabled, tail calls are emitted as follows:
//
// ```
//   <authenticate LR>
//   <check LR>
//   TCRETURN          ; the callee may sign and spill the LR in its prologue
// ```
//
// LR may require explicit checking because if FEAT_FPAC is not implemented
// and LR was tampered with, then `<authenticate LR>` will not generate an
// exception on its own. Later, if the callee spills the signed LR value and
// neither FEAT_PAuth2 nor FEAT_EPAC is implemented, the valid PAC replaces
// the higher bits of LR, thus hiding the authentication failure.
AArch64PAuth::AuthCheckMethod
AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
  if (AuthenticatedLRCheckMethod.getNumOccurrences())
    return AuthenticatedLRCheckMethod;

  // For now, use None by default because checks may introduce an unexpected
  // performance regression or incompatibility with execute-only mappings.
  return AArch64PAuth::AuthCheckMethod::None;
}
538