//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64 specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AArch64Subtarget.h"

#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
#include "AArch64TargetMachine.h"
#include "GISel/AArch64CallLowering.h"
#include "GISel/AArch64LegalizerInfo.h"
#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

#define DEBUG_TYPE "aarch64-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "AArch64GenSubtargetInfo.inc"

static cl::opt<bool>
EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
                     "converter pass"), cl::init(true), cl::Hidden);

// If the OS supports TBI, use this flag to enable it.
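// TBI (Top Byte Ignore) is an AArch64 feature where the MMU ignores the top
// eight bits of a virtual address during translation, leaving them free to
// carry a pointer tag; it is only safe to rely on when the OS guarantees that
// configuration for user-space pointers.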
static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
                         "an address is ignored"), cl::init(false), cl::Hidden);

static cl::opt<bool>
    UseNonLazyBind("aarch64-enable-nonlazybind",
                   cl::desc("Call nonlazybind functions via direct GOT load"),
                   cl::init(false), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMax(
    "aarch64-sve-vector-bits-max",
    cl::desc("Assume SVE vector registers are at most this big, "
             "with zero meaning no maximum size is assumed."),
    cl::init(0), cl::Hidden);

static cl::opt<unsigned> SVEVectorBitsMin(
    "aarch64-sve-vector-bits-min",
    cl::desc("Assume SVE vector registers are at least this big, "
             "with zero meaning no minimum size is assumed."),
    cl::init(0), cl::Hidden);
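// A hypothetical invocation that pins the SVE vector length to exactly 256
// bits would set both bounds, e.g.:
//   llc -mattr=+sve -aarch64-sve-vector-bits-min=256 \
//       -aarch64-sve-vector-bits-max=256 input.ll
// (illustrative only; the values must be multiples of 128).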

AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
                                                  StringRef CPUString) {
  // Determine default and user-specified characteristics

  if (CPUString.empty())
    CPUString = "generic";

  ParseSubtargetFeatures(CPUString, FS);
  initializeProperties();

  return *this;
}

void AArch64Subtarget::initializeProperties() {
  // Initialize CPU specific properties. We should add a tablegen feature for
  // this in the future so we can specify it together with the subtarget
  // features.
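  // Note that the Pref*LogAlignment fields below are log2 values of a byte
  // alignment, e.g. PrefFunctionLogAlignment = 4 requests 16-byte alignment.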
  switch (ARMProcFamily) {
  case Others:
    break;
  case Carmel:
    CacheLineSize = 64;
    break;
  case CortexA35:
    break;
  case CortexA53:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA55:
    break;
  case CortexA57:
    MaxInterleaveFactor = 4;
    PrefFunctionLogAlignment = 4;
    break;
  case CortexA65:
    PrefFunctionLogAlignment = 3;
    break;
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexX1:
    PrefFunctionLogAlignment = 4;
    break;
  case A64FX:
    CacheLineSize = 256;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 5;
    break;
  case AppleA7:
  case AppleA10:
  case AppleA11:
  case AppleA12:
  case AppleA13:
    CacheLineSize = 64;
    PrefetchDistance = 280;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 3;
    break;
  case ExynosM3:
    MaxInterleaveFactor = 4;
    MaxJumpTableSize = 20;
    PrefFunctionLogAlignment = 5;
    PrefLoopLogAlignment = 4;
    break;
  case Falkor:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    CacheLineSize = 128;
    PrefetchDistance = 820;
    MinPrefetchStride = 2048;
    MaxPrefetchIterationsAhead = 8;
    break;
  case Kryo:
    MaxInterleaveFactor = 4;
    VectorInsertExtractBaseCost = 2;
    CacheLineSize = 128;
    PrefetchDistance = 740;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 11;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case NeoverseE1:
    PrefFunctionLogAlignment = 3;
    break;
  case NeoverseN1:
    PrefFunctionLogAlignment = 4;
    break;
  case Saphira:
    MaxInterleaveFactor = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX2T99:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case ThunderX:
  case ThunderXT88:
  case ThunderXT81:
  case ThunderXT83:
    CacheLineSize = 128;
    PrefFunctionLogAlignment = 3;
    PrefLoopLogAlignment = 2;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  case TSV110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    break;
  case ThunderX3T110:
    CacheLineSize = 64;
    PrefFunctionLogAlignment = 4;
    PrefLoopLogAlignment = 2;
    MaxInterleaveFactor = 4;
    PrefetchDistance = 128;
    MinPrefetchStride = 1024;
    MaxPrefetchIterationsAhead = 4;
    // FIXME: remove this to enable 64-bit SLP if performance looks good.
    MinVectorRegisterBitWidth = 128;
    break;
  }
}

AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                   const std::string &FS,
                                   const TargetMachine &TM, bool LittleEndian)
    : AArch64GenSubtargetInfo(TT, CPU, FS),
      ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
      CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
      IsLittle(LittleEndian),
      TargetTriple(TT), FrameLowering(),
      InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
      TLInfo(TM, *this) {
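  // X18 is used as the platform register on several ABIs (e.g. Windows and
  // Darwin); when the target triple reserves it by default, make sure the
  // register allocator never uses it.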
  if (AArch64::isX18ReservedByDefault(TT))
    ReserveXRegister.set(18);

  CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AArch64LegalizerInfo(*this));

  auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createAArch64InstructionSelector(
      *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));

  RegBankInfo.reset(RBI);
}

const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}

const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}

const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}

/// Find the target operand flags that describe how a global value should be
/// referenced for the current subtarget.
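/// The result is a combination of AArch64II::MO_* operand flags that the
/// instruction selector consults when forming the address of the global.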
unsigned
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
                                          const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, simply to get a single 8-byte
  // absolute relocation on all global addresses.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    return AArch64II::MO_GOT;

  if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    if (GV->hasDLLImportStorageClass())
      return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    if (getTargetTriple().isOSWindows())
      return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    return AArch64II::MO_GOT;
  }

  // The small code model's direct accesses use ADRP, which cannot
  // necessarily produce the value 0 (if the code is above 4GB).
  // Same for the tiny code model, where we have a pc relative LDR.
  if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
      GV->hasExternalWeakLinkage())
    return AArch64II::MO_GOT;

  // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
  // that their nominal addresses are tagged and outside of the code model. In
  // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
  // tag if necessary based on MO_TAGGED.
  if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    return AArch64II::MO_NC | AArch64II::MO_TAGGED;

  return AArch64II::MO_NO_FLAG;
}

unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    const GlobalValue *GV, const TargetMachine &TM) const {
  // MachO large model always goes via a GOT, because we don't have the
  // relocations available to do anything else.
  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
      !GV->hasInternalLinkage())
    return AArch64II::MO_GOT;

  // NonLazyBind goes via GOT unless we know it's available locally.
  auto *F = dyn_cast<Function>(GV);
  if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
      !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    return AArch64II::MO_GOT;

  // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
  if (getTargetTriple().isOSWindows())
    return ClassifyGlobalReference(GV, TM);

  return AArch64II::MO_NO_FLAG;
}

void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                           unsigned NumRegionInstrs) const {
  // An LNT run (at least on Cyclone) showed reasonably significant gains for
  // bi-directional scheduling, e.g. on 253.perlbmk.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;
  // Enabling or disabling the latency heuristic is a close call: it seems to
  // help nearly no benchmark on out-of-order architectures; on the other
  // hand, it regresses register pressure on a few benchmarks.
  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}

bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}

bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
  if (!UseAddressTopByteIgnored)
    return false;

  if (TargetTriple.isiOS()) {
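    // Darwin kernels from iOS 8 onwards are assumed to leave the top address
    // byte free for user-space tagging, so TBI can be relied upon there.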
    unsigned Major, Minor, Micro;
    TargetTriple.getiOSVersion(Major, Minor, Micro);
    return Major >= 8;
  }

  return false;
}

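// When the subtarget requests balanced FP operations (balanceFPOps()), the
// PBQP register allocator is handed the Cortex-A57 chaining constraint, which
// is intended to steer chained FP multiply(-accumulate) operations toward a
// balanced mix of odd and even registers on that core.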
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
  return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
}

void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
  // We usually compute the max call frame size after ISel. Do the computation
  // now if the .mir file didn't specify it. Note that this will probably give
  // you bogus values after PEI has eliminated the callframe setup/destroy
  // pseudo instructions; specify it explicitly if you need it to be correct.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  if (!MFI.isMaxCallFrameSizeComputed())
    MFI.computeMaxCallFrameSize(MF);
}

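// Both SVE vector-length getters below round the command-line bounds down to
// a multiple of 128 bits and clamp them against each other, so the reported
// minimum never exceeds the reported maximum.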
unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMax % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return 0;
  return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}

unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
  assert(HasSVE && "Tried to get SVE vector length without SVE support!");
  assert(SVEVectorBitsMin % 128 == 0 &&
         "SVE requires vector length in multiples of 128!");
  assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
         "Minimum SVE vector size should not be larger than its maximum!");
  if (SVEVectorBitsMax == 0)
    return (SVEVectorBitsMin / 128) * 128;
  return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
}
369