xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.cpp (revision 43e29d03f416d7dda52112a29600a7c82ee1a91e)
1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64Subtarget.h"
14 
15 #include "AArch64.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/IR/GlobalValue.h"
27 #include "llvm/Support/AArch64TargetParser.h"
28 #include "llvm/Support/TargetParser.h"
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "aarch64-subtarget"
33 
34 #define GET_SUBTARGETINFO_CTOR
35 #define GET_SUBTARGETINFO_TARGET_DESC
36 #include "AArch64GenSubtargetInfo.inc"
37 
38 static cl::opt<bool>
39 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
40                      "converter pass"), cl::init(true), cl::Hidden);
41 
42 // If OS supports TBI, use this flag to enable it.
43 static cl::opt<bool>
44 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
45                          "an address is ignored"), cl::init(false), cl::Hidden);
46 
47 static cl::opt<bool>
48     UseNonLazyBind("aarch64-enable-nonlazybind",
49                    cl::desc("Call nonlazybind functions via direct GOT load"),
50                    cl::init(false), cl::Hidden);
51 
52 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
53                            cl::desc("Enable the use of AA during codegen."));
54 
55 static cl::opt<unsigned> OverrideVectorInsertExtractBaseCost(
56     "aarch64-insert-extract-base-cost",
57     cl::desc("Base cost of vector insert/extract element"), cl::Hidden);
58 
59 // Reserve a list of X# registers, so they are unavailable for register
60 // allocator, but can still be used as ABI requests, such as passing arguments
61 // to function call.
62 static cl::list<std::string>
63 ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
64                   "registers, so they can't be used by register allocator. "
65                   "Should only be used for testing register allocator."),
66                   cl::CommaSeparated, cl::Hidden);
67 
68 static cl::opt<bool>
69     ForceStreamingCompatibleSVE("force-streaming-compatible-sve",
70                                 cl::init(false), cl::Hidden);
71 
72 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
73   if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
74     return OverrideVectorInsertExtractBaseCost;
75   return VectorInsertExtractBaseCost;
76 }
77 
78 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
79     StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
80   // Determine default and user-specified characteristics
81 
82   if (CPUString.empty())
83     CPUString = "generic";
84 
85   if (TuneCPUString.empty())
86     TuneCPUString = CPUString;
87 
88   ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
89   initializeProperties();
90 
91   return *this;
92 }
93 
94 void AArch64Subtarget::initializeProperties() {
95   // Initialize CPU specific properties. We should add a tablegen feature for
96   // this in the future so we can specify it together with the subtarget
97   // features.
98   switch (ARMProcFamily) {
99   case Others:
100     break;
101   case Carmel:
102     CacheLineSize = 64;
103     break;
104   case CortexA35:
105   case CortexA53:
106   case CortexA55:
107     PrefFunctionLogAlignment = 4;
108     PrefLoopLogAlignment = 4;
109     MaxBytesForLoopAlignment = 8;
110     break;
111   case CortexA57:
112     MaxInterleaveFactor = 4;
113     PrefFunctionLogAlignment = 4;
114     PrefLoopLogAlignment = 4;
115     MaxBytesForLoopAlignment = 8;
116     break;
117   case CortexA65:
118     PrefFunctionLogAlignment = 3;
119     break;
120   case CortexA72:
121   case CortexA73:
122   case CortexA75:
123     PrefFunctionLogAlignment = 4;
124     PrefLoopLogAlignment = 4;
125     MaxBytesForLoopAlignment = 8;
126     break;
127   case CortexA76:
128   case CortexA77:
129   case CortexA78:
130   case CortexA78C:
131   case CortexR82:
132   case CortexX1:
133   case CortexX1C:
134     PrefFunctionLogAlignment = 4;
135     PrefLoopLogAlignment = 5;
136     MaxBytesForLoopAlignment = 16;
137     break;
138   case CortexA510:
139     PrefFunctionLogAlignment = 4;
140     VScaleForTuning = 1;
141     PrefLoopLogAlignment = 4;
142     MaxBytesForLoopAlignment = 8;
143     break;
144   case CortexA710:
145   case CortexA715:
146   case CortexX2:
147   case CortexX3:
148     PrefFunctionLogAlignment = 4;
149     VScaleForTuning = 1;
150     PrefLoopLogAlignment = 5;
151     MaxBytesForLoopAlignment = 16;
152     break;
153   case A64FX:
154     CacheLineSize = 256;
155     PrefFunctionLogAlignment = 3;
156     PrefLoopLogAlignment = 2;
157     MaxInterleaveFactor = 4;
158     PrefetchDistance = 128;
159     MinPrefetchStride = 1024;
160     MaxPrefetchIterationsAhead = 4;
161     VScaleForTuning = 4;
162     break;
163   case AppleA7:
164   case AppleA10:
165   case AppleA11:
166   case AppleA12:
167   case AppleA13:
168   case AppleA14:
169   case AppleA15:
170   case AppleA16:
171     CacheLineSize = 64;
172     PrefetchDistance = 280;
173     MinPrefetchStride = 2048;
174     MaxPrefetchIterationsAhead = 3;
175     switch (ARMProcFamily) {
176     case AppleA14:
177     case AppleA15:
178     case AppleA16:
179       MaxInterleaveFactor = 4;
180       break;
181     default:
182       break;
183     }
184     break;
185   case ExynosM3:
186     MaxInterleaveFactor = 4;
187     MaxJumpTableSize = 20;
188     PrefFunctionLogAlignment = 5;
189     PrefLoopLogAlignment = 4;
190     break;
191   case Falkor:
192     MaxInterleaveFactor = 4;
193     // FIXME: remove this to enable 64-bit SLP if performance looks good.
194     MinVectorRegisterBitWidth = 128;
195     CacheLineSize = 128;
196     PrefetchDistance = 820;
197     MinPrefetchStride = 2048;
198     MaxPrefetchIterationsAhead = 8;
199     break;
200   case Kryo:
201     MaxInterleaveFactor = 4;
202     VectorInsertExtractBaseCost = 2;
203     CacheLineSize = 128;
204     PrefetchDistance = 740;
205     MinPrefetchStride = 1024;
206     MaxPrefetchIterationsAhead = 11;
207     // FIXME: remove this to enable 64-bit SLP if performance looks good.
208     MinVectorRegisterBitWidth = 128;
209     break;
210   case NeoverseE1:
211     PrefFunctionLogAlignment = 3;
212     break;
213   case NeoverseN1:
214     PrefFunctionLogAlignment = 4;
215     PrefLoopLogAlignment = 5;
216     MaxBytesForLoopAlignment = 16;
217     break;
218   case NeoverseN2:
219   case NeoverseV2:
220     PrefFunctionLogAlignment = 4;
221     PrefLoopLogAlignment = 5;
222     MaxBytesForLoopAlignment = 16;
223     VScaleForTuning = 1;
224     break;
225   case NeoverseV1:
226     PrefFunctionLogAlignment = 4;
227     PrefLoopLogAlignment = 5;
228     MaxBytesForLoopAlignment = 16;
229     VScaleForTuning = 2;
230     break;
231   case Neoverse512TVB:
232     PrefFunctionLogAlignment = 4;
233     VScaleForTuning = 1;
234     MaxInterleaveFactor = 4;
235     break;
236   case Saphira:
237     MaxInterleaveFactor = 4;
238     // FIXME: remove this to enable 64-bit SLP if performance looks good.
239     MinVectorRegisterBitWidth = 128;
240     break;
241   case ThunderX2T99:
242     CacheLineSize = 64;
243     PrefFunctionLogAlignment = 3;
244     PrefLoopLogAlignment = 2;
245     MaxInterleaveFactor = 4;
246     PrefetchDistance = 128;
247     MinPrefetchStride = 1024;
248     MaxPrefetchIterationsAhead = 4;
249     // FIXME: remove this to enable 64-bit SLP if performance looks good.
250     MinVectorRegisterBitWidth = 128;
251     break;
252   case ThunderX:
253   case ThunderXT88:
254   case ThunderXT81:
255   case ThunderXT83:
256     CacheLineSize = 128;
257     PrefFunctionLogAlignment = 3;
258     PrefLoopLogAlignment = 2;
259     // FIXME: remove this to enable 64-bit SLP if performance looks good.
260     MinVectorRegisterBitWidth = 128;
261     break;
262   case TSV110:
263     CacheLineSize = 64;
264     PrefFunctionLogAlignment = 4;
265     PrefLoopLogAlignment = 2;
266     break;
267   case ThunderX3T110:
268     CacheLineSize = 64;
269     PrefFunctionLogAlignment = 4;
270     PrefLoopLogAlignment = 2;
271     MaxInterleaveFactor = 4;
272     PrefetchDistance = 128;
273     MinPrefetchStride = 1024;
274     MaxPrefetchIterationsAhead = 4;
275     // FIXME: remove this to enable 64-bit SLP if performance looks good.
276     MinVectorRegisterBitWidth = 128;
277     break;
278   case Ampere1:
279   case Ampere1A:
280     CacheLineSize = 64;
281     PrefFunctionLogAlignment = 6;
282     PrefLoopLogAlignment = 6;
283     MaxInterleaveFactor = 4;
284     break;
285   }
286 }
287 
288 AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
289                                    StringRef TuneCPU, StringRef FS,
290                                    const TargetMachine &TM, bool LittleEndian,
291                                    unsigned MinSVEVectorSizeInBitsOverride,
292                                    unsigned MaxSVEVectorSizeInBitsOverride,
293                                    bool StreamingSVEModeDisabled)
294     : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
295       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
296       ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
297       CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
298       IsLittle(LittleEndian),
299       StreamingSVEModeDisabled(StreamingSVEModeDisabled),
300       MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
301       MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
302       InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
303       TLInfo(TM, *this) {
304   if (AArch64::isX18ReservedByDefault(TT))
305     ReserveXRegister.set(18);
306 
307   CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
308   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
309   Legalizer.reset(new AArch64LegalizerInfo(*this));
310 
311   auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
312 
313   // FIXME: At this point, we can't rely on Subtarget having RBI.
314   // It's awkward to mix passing RBI and the Subtarget; should we pass
315   // TII/TRI as well?
316   InstSelector.reset(createAArch64InstructionSelector(
317       *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
318 
319   RegBankInfo.reset(RBI);
320 
321   auto TRI = getRegisterInfo();
322   StringSet<> ReservedRegNames;
323   ReservedRegNames.insert(ReservedRegsForRA.begin(), ReservedRegsForRA.end());
324   for (unsigned i = 0; i < 29; ++i) {
325     if (ReservedRegNames.count(TRI->getName(AArch64::X0 + i)))
326       ReserveXRegisterForRA.set(i);
327   }
328   // X30 is named LR, so we can't use TRI->getName to check X30.
329   if (ReservedRegNames.count("X30") || ReservedRegNames.count("LR"))
330     ReserveXRegisterForRA.set(30);
331   // X29 is named FP, so we can't use TRI->getName to check X29.
332   if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
333     ReserveXRegisterForRA.set(29);
334 }
335 
336 const CallLowering *AArch64Subtarget::getCallLowering() const {
337   return CallLoweringInfo.get();
338 }
339 
340 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
341   return InlineAsmLoweringInfo.get();
342 }
343 
344 InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
345   return InstSelector.get();
346 }
347 
348 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
349   return Legalizer.get();
350 }
351 
352 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
353   return RegBankInfo.get();
354 }
355 
356 /// Find the target operand flags that describe how a global value should be
357 /// referenced for the current subtarget.
358 unsigned
359 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
360                                           const TargetMachine &TM) const {
361   // MachO large model always goes via a GOT, simply to get a single 8-byte
362   // absolute relocation on all global addresses.
363   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
364     return AArch64II::MO_GOT;
365 
366   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
367     if (GV->hasDLLImportStorageClass()) {
368       if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy())
369         return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX;
370       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
371     }
372     if (getTargetTriple().isOSWindows())
373       return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
374     return AArch64II::MO_GOT;
375   }
376 
377   // The small code model's direct accesses use ADRP, which cannot
378   // necessarily produce the value 0 (if the code is above 4GB).
379   // Same for the tiny code model, where we have a pc relative LDR.
380   if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
381       GV->hasExternalWeakLinkage())
382     return AArch64II::MO_GOT;
383 
384   // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
385   // that their nominal addresses are tagged and outside of the code model. In
386   // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
387   // tag if necessary based on MO_TAGGED.
388   if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
389     return AArch64II::MO_NC | AArch64II::MO_TAGGED;
390 
391   return AArch64II::MO_NO_FLAG;
392 }
393 
394 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
395     const GlobalValue *GV, const TargetMachine &TM) const {
396   // MachO large model always goes via a GOT, because we don't have the
397   // relocations available to do anything else..
398   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
399       !GV->hasInternalLinkage())
400     return AArch64II::MO_GOT;
401 
402   // NonLazyBind goes via GOT unless we know it's available locally.
403   auto *F = dyn_cast<Function>(GV);
404   if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
405       !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
406     return AArch64II::MO_GOT;
407 
408   if (getTargetTriple().isOSWindows()) {
409     if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() &&
410         GV->hasDLLImportStorageClass()) {
411       // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT,
412       // not MO_DLLIMPORTAUX.
413       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
414     }
415 
416     // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
417     return ClassifyGlobalReference(GV, TM);
418   }
419 
420   return AArch64II::MO_NO_FLAG;
421 }
422 
423 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
424                                            unsigned NumRegionInstrs) const {
425   // LNT run (at least on Cyclone) showed reasonably significant gains for
426   // bi-directional scheduling. 253.perlbmk.
427   Policy.OnlyTopDown = false;
428   Policy.OnlyBottomUp = false;
429   // Enabling or Disabling the latency heuristic is a close call: It seems to
430   // help nearly no benchmark on out-of-order architectures, on the other hand
431   // it regresses register pressure on a few benchmarking.
432   Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
433 }
434 
435 bool AArch64Subtarget::enableEarlyIfConversion() const {
436   return EnableEarlyIfConvert;
437 }
438 
439 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
440   if (!UseAddressTopByteIgnored)
441     return false;
442 
443   if (TargetTriple.isDriverKit())
444     return true;
445   if (TargetTriple.isiOS()) {
446     return TargetTriple.getiOSVersion() >= VersionTuple(8);
447   }
448 
449   return false;
450 }
451 
452 std::unique_ptr<PBQPRAConstraint>
453 AArch64Subtarget::getCustomPBQPConstraints() const {
454   return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
455 }
456 
457 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
458   // We usually compute max call frame size after ISel. Do the computation now
459   // if the .mir file didn't specify it. Note that this will probably give you
460   // bogus values after PEI has eliminated the callframe setup/destroy pseudo
461   // instructions, specify explicitly if you need it to be correct.
462   MachineFrameInfo &MFI = MF.getFrameInfo();
463   if (!MFI.isMaxCallFrameSizeComputed())
464     MFI.computeMaxCallFrameSize(MF);
465 }
466 
467 bool AArch64Subtarget::useAA() const { return UseAA; }
468 
469 bool AArch64Subtarget::forceStreamingCompatibleSVE() const {
470   if (ForceStreamingCompatibleSVE) {
471     assert(hasSVEorSME() && "Expected SVE to be available");
472     return hasSVEorSME();
473   }
474   return false;
475 }
476