xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/SipHash.h"
#include "llvm/TargetParser/AArch64TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);
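
// Usage sketch (file names are illustrative): the cl::opt flags in this file
// are hidden LLVM options, so they can be passed to llc directly or reach
// clang via -mllvm, e.g.:
//   llc -aarch64-early-ifcvt=false foo.ll
//   clang -c foo.c -mllvm -aarch64-early-ifcvt=false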

// If the OS supports TBI, use this flag to enable it.
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool> MachOUseNonLazyBind(
    "aarch64-macho-enable-nonlazybind",
    cl::desc("Call nonlazybind functions via direct GOT load for Mach-O"),
    cl::Hidden);

static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
                           cl::desc("Enable the use of AA during codegen."));

static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
    "aarch64-insert-extract-base-cost",
    cl::desc("Base cost of vector insert/extract element"), cl::Hidden);

// Reserve a list of X# registers so they are unavailable to the register
// allocator but can still be used as the ABI requires, e.g. for passing
// arguments to a function call.
static cl::list<std::string>
ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                  "registers, so they can't be used by register allocator. "
                  "Should only be used for testing register allocator."),
                  cl::CommaSeparated, cl::Hidden);
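
// Usage sketch (register choice is arbitrary): since the list is
// cl::CommaSeparated, several registers can be withheld from allocation in
// one flag for a test:
//   llc -reserve-regs-for-regalloc=X9,X10 foo.ll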

static cl::opt<AArch64PAuth::AuthCheckMethod>
    AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
                               cl::Hidden,
                               cl::desc("Override the variant of check applied "
                                        "to authenticated LR during tail call"),
                               cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR));

static cl::opt<unsigned> AArch64MinimumJumpTableEntries(
    "aarch64-min-jump-table-entries", cl::init(10), cl::Hidden,
    cl::desc("Set minimum number of entries to use a jump table on AArch64"));

static cl::opt<unsigned> AArch64StreamingHazardSize(
    "aarch64-streaming-hazard-size",
    cl::desc("Hazard size for streaming mode memory accesses. 0 = disabled."),
    cl::init(0), cl::Hidden);

static cl::alias AArch64StreamingStackHazardSize(
    "aarch64-stack-hazard-size",
    cl::desc("alias for -aarch64-streaming-hazard-size"),
    cl::aliasopt(AArch64StreamingHazardSize));
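
// Because of the cl::alias above, these two invocations are equivalent:
//   llc -aarch64-streaming-hazard-size=1024 foo.ll
//   llc -aarch64-stack-hazard-size=1024 foo.ll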

static cl::opt<bool> EnableZPRPredicateSpills(
    "aarch64-enable-zpr-predicate-spills", cl::init(false), cl::Hidden,
    cl::desc(
        "Enables spilling/reloading SVE predicates as data vectors (ZPRs)"));

// Subreg liveness tracking is disabled by default for now until all issues
// are ironed out. This option allows the feature to be used in tests.
static cl::opt<bool>
    EnableSubregLivenessTracking("aarch64-enable-subreg-liveness-tracking",
                                 cl::init(false), cl::Hidden,
                                 cl::desc("Enable subreg liveness tracking"));

static cl::opt<bool>
    UseScalarIncVL("sve-use-scalar-inc-vl", cl::init(false), cl::Hidden,
                   cl::desc("Prefer add+cnt over addvl/inc/dec"));

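// Note: getNumOccurrences() distinguishes "left at its default" from
// "explicitly passed", so an explicit -aarch64-insert-extract-base-cost=0
// still overrides the subtarget value below even though the value is zero.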
unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
  if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
    return OverrideVectorInsertExtractBaseCost;
  return VectorInsertExtractBaseCost;
}

AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
    StringRef FS, StringRef CPUString, StringRef TuneCPUString,
    bool HasMinSize) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  if (TuneCPUString.empty())
    TuneCPUString = CPUString;

  ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
  initializeProperties(HasMinSize);

  return *this;
}

void AArch64Subtarget::initializeProperties(bool HasMinSize) {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
  switch (ARMProcFamily) {
  case Generic:
    // With TuneCPU=generic we avoid ldapur instructions, to line up with the
    // CPUs that use the AvoidLDAPUR feature. We don't want this to be on
    // forever, so it is enabled between armv8.4 and armv8.7/armv9.2.
    if (hasV8_4aOps() && !hasV8_8aOps())
      AvoidLDAPUR = true;
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
  case CortexA53:
  case CortexA55:
  case CortexR82:
  case CortexR82AE:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA65:
    PrefFunctionAlignment = Align(8);
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78AE:
  case CortexA78C:
  case CortexX1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case CortexA320:
  case CortexA510:
  case CortexA520:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(16);
    MaxBytesForLoopAlignment = 8;
    break;
  case CortexA710:
  case CortexA715:
  case CortexA720:
  case CortexA725:
  case CortexX2:
  case CortexX3:
  case CortexX4:
  case CortexX925:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    VScaleForTuning = 4;
    break;
  case MONAKA:
    VScaleForTuning = 2;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
  case AppleA14:
  case AppleA15:
  case AppleA16:
  case AppleA17:
  case AppleM4:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    switch (ARMProcFamily) {
    case AppleA14:
    case AppleA15:
    case AppleA16:
    case AppleA17:
    case AppleM4:
      MaxInterleaveFactor = 4;
      break;
    default:
      break;
    }
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionAlignment = Align(32);
    PrefLoopAlignment = Align(16);
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionAlignment = Align(8);
    break;
  case NeoverseN1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    break;
  case NeoverseV2:
  case NeoverseV3:
    CacheLineSize = 64;
    EpilogueVectorizationMinVF = 8;
    MaxInterleaveFactor = 4;
    ScatterOverhead = 13;
    LLVM_FALLTHROUGH;
  case NeoverseN2:
  case NeoverseN3:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  case NeoverseV1:
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 2;
    DefaultSVETFOpts = TailFoldingOpts::Simple;
    break;
  case Neoverse512TVB:
    PrefFunctionAlignment = Align(16);
    VScaleForTuning = 1;
    MaxInterleaveFactor = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionAlignment = Align(8);
    PrefLoopAlignment = Align(4);
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(4);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case Ampere1:
  case Ampere1A:
  case Ampere1B:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(64);
    PrefLoopAlignment = Align(64);
    MaxInterleaveFactor = 4;
    break;
  case Oryon:
    CacheLineSize = 64;
    PrefFunctionAlignment = Align(16);
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    break;
  case Olympus:
    EpilogueVectorizationMinVF = 8;
    MaxInterleaveFactor = 4;
    ScatterOverhead = 13;
    PrefFunctionAlignment = Align(16);
    PrefLoopAlignment = Align(32);
    MaxBytesForLoopAlignment = 16;
    VScaleForTuning = 1;
    break;
  }

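  // Intent: an explicit -aarch64-min-jump-table-entries always wins; its
  // default (10) is applied only when not optimizing for minimum size, so
  // minsize functions keep the subtarget's pre-existing default.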
  if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize)
    MinimumJumpTableEntries = AArch64MinimumJumpTableEntries;
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
                                   StringRef TuneCPU, StringRef FS,
                                   const TargetMachine &TM, bool LittleEndian,
                                   unsigned MinSVEVectorSizeInBitsOverride,
                                   unsigned MaxSVEVectorSizeInBitsOverride,
                                   bool IsStreaming, bool IsStreamingCompatible,
                                   bool HasMinSize)
    : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian), IsStreaming(IsStreaming),
      IsStreamingCompatible(IsStreamingCompatible),
      StreamingHazardSize(
          AArch64StreamingHazardSize.getNumOccurrences() > 0
              ? std::optional<unsigned>(AArch64StreamingHazardSize)
              : std::nullopt),
      MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
      MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
      InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
      TLInfo(TM, *this) {
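  // X18 is the platform register on some targets (e.g. Darwin and Windows),
  // where the ABI reserves it; keep it out of the allocatable set there.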
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);

  auto TRI = getRegisterInfo();
  StringSet<> ReservedRegNames(llvm::from_range, ReservedRegsForRA);
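  // Match X0..X28 by name. Note this relies on AArch64::X0 through X28 being
  // consecutive values in the generated register enum.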
  for (unsigned i = 0; i < 29; ++i) {
    if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
      ReserveXRegisterForRA.set(i);
  }
  // X30 is named LR, so we can't use TRI->getName to check X30.
  if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
    ReserveXRegisterForRA.set(30);
  // X29 is named FP, so we can't use TRI->getName to check X29.
  if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
    ReserveXRegisterForRA.set(29);

  EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
}

unsigned AArch64Subtarget::getHwModeSet() const {
  AArch64HwModeBits Modes = AArch64HwModeBits::DefaultMode;

  // Use a special hardware mode in streaming[-compatible] functions with
  // aarch64-enable-zpr-predicate-spills. This changes the spill size (and
  // alignment) for the predicate register class.
  if (EnableZPRPredicateSpills.getValue() &&
      (isStreaming() || isStreamingCompatible())) {
    Modes |= AArch64HwModeBits::SMEWithZPRPredicateSpills;
  }

  return to_underlying(Modes);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  // All globals dynamically protected by MTE must have their address tags
  // synthesized. This is done by having the loader stash the tag in the GOT
  // entry. Force all tagged globals (even ones with internal linkage) through
  // the GOT.
  if (GV->isTagged())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(GV)) {
    if (GV->hasDLLImportStorageClass()) {
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    }
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if ((!isTargetMachO() || MachOUseNonLazyBind) && F &&
      F->hasFnAttribute(Attribute::NonLazyBind) && !TM.shouldAssumeDSOLocal(GV))
    return AArch64II::MO_GOT;

  if (getTargetTriple().isOSWindows()) {
    if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy()) {
      if (GV->hasDLLImportStorageClass()) {
        // On Arm64EC, if we're calling a symbol from the import table
        // directly, use MO_ARM64EC_CALLMANGLE.
        return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT |
               AArch64II::MO_ARM64EC_CALLMANGLE;
      }
      if (GV->hasExternalLinkage()) {
        // If we're calling a symbol directly, use the mangled form in the
        // call instruction.
        return AArch64II::MO_ARM64EC_CALLMANGLE;
      }
    }

    // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    return ClassifyGlobalReference(GV, TM);
  }

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures, while on the
  // other hand it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

void AArch64Subtarget::adjustSchedDependency(
    SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
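  // Only adjust data dependencies through a register when at least one side
  // of the edge is a BUNDLE; otherwise keep the scheduler's default latency.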
  if (!SchedModel || Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr() ||
      (Def->getInstr()->getOpcode() != TargetOpcode::BUNDLE &&
       Use->getInstr()->getOpcode() != TargetOpcode::BUNDLE))
    return;

  // If the Def is a BUNDLE, find the last instruction in the bundle that defs
  // the register.
  const MachineInstr *DefMI = Def->getInstr();
  if (DefMI->getOpcode() == TargetOpcode::BUNDLE) {
    Register Reg = DefMI->getOperand(DefOpIdx).getReg();
    for (const auto &Op : const_mi_bundle_ops(*DefMI)) {
      if (Op.isReg() && Op.isDef() && Op.getReg() == Reg) {
        DefMI = Op.getParent();
        DefOpIdx = Op.getOperandNo();
      }
    }
  }

  // If the Use is a BUNDLE, find the first instruction that uses the Reg.
  const MachineInstr *UseMI = Use->getInstr();
  if (UseMI->getOpcode() == TargetOpcode::BUNDLE) {
    Register Reg = UseMI->getOperand(UseOpIdx).getReg();
    for (const auto &Op : const_mi_bundle_ops(*UseMI)) {
      if (Op.isReg() && Op.isUse() && Op.getReg() == Reg) {
        UseMI = Op.getParent();
        UseOpIdx = Op.getOperandNo();
        break;
      }
    }
  }

  Dep.setLatency(
      SchedModel->computeOperandLatency(DefMI, DefOpIdx, UseMI, UseOpIdx));
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

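  // Only Darwin-derived OSes are treated here as guaranteeing TBI for user
  // code: DriverKit unconditionally, and iOS from version 8 onwards.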
  if (TargetTriple.isDriverKit())
    return true;
  if (TargetTriple.isiOS()) {
    return TargetTriple.getiOSVersion() >= VersionTuple(8);
  }

  return false;
}

std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute max call frame size after ISel. Do the computation now
  // if the .mir file didn't specify it. Note that this will probably give you
  // bogus values after PEI has eliminated the callframe setup/destroy pseudo
  // instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

bool AArch64Subtarget::useAA() const { return UseAA; }

bool AArch64Subtarget::useScalarIncVL() const {
  // If UseScalarIncVL was set explicitly, honor it. Otherwise, if SVE2 or SME
  // is present (i.e. we are not SVE1-only), enable it by default.
  if (UseScalarIncVL.getNumOccurrences())
    return UseScalarIncVL;
  return hasSVE2() || hasSME();
}

// If return address signing is enabled, tail calls are emitted as follows:
//
// ```
//   <authenticate LR>
//   <check LR>
//   TCRETURN          ; the callee may sign and spill the LR in its prologue
// ```
//
// LR may require explicit checking because if FEAT_FPAC is not implemented
// and LR was tampered with, then `<authenticate LR>` will not generate an
// exception on its own. Later, if the callee spills the signed LR value and
// neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces
// the higher bits of LR, thus hiding the authentication failure.
AArch64PAuth::AuthCheckMethod AArch64Subtarget::getAuthenticatedLRCheckMethod(
    const MachineFunction &MF) const {
  // TODO: Check subtarget for the scheme. The present variant is a default
  // for the pauthtest ABI.
  if (MF.getFunction().hasFnAttribute("ptrauth-returns") &&
      MF.getFunction().hasFnAttribute("ptrauth-auth-traps"))
    return AArch64PAuth::AuthCheckMethod::HighBitsNoTBI;
  if (AuthenticatedLRCheckMethod.getNumOccurrences())
    return AuthenticatedLRCheckMethod;

  // For now, use None by default because checks may introduce an unexpected
  // performance regression or an incompatibility with execute-only mappings.
  return AArch64PAuth::AuthCheckMethod::None;
}

std::optional<uint16_t>
AArch64Subtarget::getPtrAuthBlockAddressDiscriminatorIfEnabled(
    const Function &ParentFn) const {
  if (!ParentFn.hasFnAttribute("ptrauth-indirect-gotos"))
    return std::nullopt;
  // We currently have one simple mechanism for all targets.
  // This isn't ABI, so we can always do better in the future.
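  // The result is a stable 16-bit SipHash of "<function-name> blockaddress",
  // so renaming the parent function changes the discriminator.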
  return getPointerAuthStableSipHash(
      (Twine(ParentFn.getName()) + " blockaddress").str());
}

bool AArch64Subtarget::isX16X17Safer() const {
  // The Darwin kernel implements special protections for x16 and x17, so we
  // should prefer to use those registers on that platform.
  return isTargetDarwin();
}

bool AArch64Subtarget::enableMachinePipeliner() const {
  return getSchedModel().hasInstrSchedModel();
}
677