xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- ARMSubtarget.cpp - ARM Subtarget Information ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the ARM specific subclass of TargetSubtargetInfo.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "ARM.h"
14 
15 #include "ARMCallLowering.h"
16 #include "ARMFrameLowering.h"
17 #include "ARMInstrInfo.h"
18 #include "ARMLegalizerInfo.h"
19 #include "ARMRegisterBankInfo.h"
20 #include "ARMSubtarget.h"
21 #include "ARMTargetMachine.h"
22 #include "MCTargetDesc/ARMMCTargetDesc.h"
23 #include "Thumb1FrameLowering.h"
24 #include "Thumb1InstrInfo.h"
25 #include "Thumb2InstrInfo.h"
26 #include "llvm/ADT/StringRef.h"
27 #include "llvm/ADT/Twine.h"
28 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
29 #include "llvm/CodeGen/MachineFrameInfo.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/IR/Function.h"
32 #include "llvm/IR/GlobalValue.h"
33 #include "llvm/MC/MCAsmInfo.h"
34 #include "llvm/MC/MCTargetOptions.h"
35 #include "llvm/Support/CodeGen.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Target/TargetOptions.h"
38 #include "llvm/TargetParser/ARMTargetParser.h"
39 #include "llvm/TargetParser/Triple.h"
40 
41 using namespace llvm;
42 
43 #define DEBUG_TYPE "arm-subtarget"
44 
45 #define GET_SUBTARGETINFO_TARGET_DESC
46 #define GET_SUBTARGETINFO_CTOR
47 #include "ARMGenSubtargetInfo.inc"
48 
// Controls the UseMulOps subtarget flag: allow use of multiply/accumulate
// style operations when selecting code (default: on).
static cl::opt<bool>
UseFusedMulOps("arm-use-mulops",
               cl::init(true), cl::Hidden);

// How the compiler is allowed to form Thumb-2 IT blocks (see the IT option
// below for the user-visible flag names).
enum ITMode {
  DefaultIT,
  RestrictedIT
};

static cl::opt<ITMode>
    IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
       cl::values(clEnumValN(DefaultIT, "arm-default-it",
                             "Generate any type of IT block"),
                  clEnumValN(RestrictedIT, "arm-restrict-it",
                             "Disallow complex IT blocks")));

/// ForceFastISel - Use the fast-isel, even for subtargets where it is not
/// currently supported (for testing only).
static cl::opt<bool>
ForceFastISel("arm-force-fast-isel",
               cl::init(false), cl::Hidden);
70 
71 /// initializeSubtargetDependencies - Initializes using a CPU and feature string
72 /// so that we can use initializer lists for subtarget initialization.
initializeSubtargetDependencies(StringRef CPU,StringRef FS)73 ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
74                                                             StringRef FS) {
75   initSubtargetFeatures(CPU, FS);
76   return *this;
77 }
78 
initializeFrameLowering(StringRef CPU,StringRef FS)79 ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
80                                                         StringRef FS) {
81   ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS);
82   if (STI.isThumb1Only())
83     return (ARMFrameLowering *)new Thumb1FrameLowering(STI);
84 
85   return new ARMFrameLowering(STI);
86 }
87 
/// ARMSubtarget constructor. The member-initializer order is significant:
/// FrameLowering's initializer runs initializeFrameLowering(), which in turn
/// runs initializeSubtargetDependencies(), so by the time InstrInfo and
/// TLInfo are constructed the feature bits (isThumb1Only(), isThumb()) are
/// valid and may be queried directly.
ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
                           const std::string &FS,
                           const ARMBaseTargetMachine &TM, bool IsLittle,
                           bool MinSize)
    : ARMGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      UseMulOps(UseFusedMulOps), CPUString(CPU), OptMinSize(MinSize),
      IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
      FrameLowering(initializeFrameLowering(CPU, FS)),
      // At this point initializeSubtargetDependencies has been called so
      // we can query directly.
      InstrInfo(isThumb1Only()
                    ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this)
                    : !isThumb()
                          ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
                          : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
      TLInfo(TM, *this) {

  // Construct the GlobalISel pipeline objects owned by this subtarget.
  CallLoweringInfo.reset(new ARMCallLowering(*getTargetLowering()));
  Legalizer.reset(new ARMLegalizerInfo(*this));

  auto *RBI = new ARMRegisterBankInfo(*getRegisterInfo());

  // FIXME: At this point, we can't rely on Subtarget having RBI.
  // It's awkward to mix passing RBI and the Subtarget; should we pass
  // TII/TRI as well?
  InstSelector.reset(createARMInstructionSelector(TM, *this, *RBI));

  // RegBankInfo takes ownership of the raw RBI allocated above.
  RegBankInfo.reset(RBI);
}
117 
getCallLowering() const118 const CallLowering *ARMSubtarget::getCallLowering() const {
119   return CallLoweringInfo.get();
120 }
121 
getInstructionSelector() const122 InstructionSelector *ARMSubtarget::getInstructionSelector() const {
123   return InstSelector.get();
124 }
125 
getLegalizerInfo() const126 const LegalizerInfo *ARMSubtarget::getLegalizerInfo() const {
127   return Legalizer.get();
128 }
129 
getRegBankInfo() const130 const RegisterBankInfo *ARMSubtarget::getRegBankInfo() const {
131   return RegBankInfo.get();
132 }
133 
bool ARMSubtarget::isXRaySupported() const {
  // XRay is only wired up here for ARM-mode code on v6 and above.
  // We don't currently support Thumb, but Windows requires Thumb.
  return hasV6Ops() && hasARMOps() && !isTargetWindows();
}
138 
/// initSubtargetFeatures - Choose a default CPU if none was given, parse the
/// feature string, and derive all feature-dependent subtarget state:
/// scheduling data, stack alignment, tail-call support and per-core tuning.
void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
  // Fall back to a sensible default CPU when none was specified.
  if (CPUString.empty()) {
    CPUString = "generic";

    if (isTargetDarwin()) {
      StringRef ArchName = TargetTriple.getArchName();
      ARM::ArchKind AK = ARM::parseArch(ArchName);
      if (AK == ARM::ArchKind::ARMV7S)
        // Default to the Swift CPU when targeting armv7s/thumbv7s.
        CPUString = "swift";
      else if (AK == ARM::ArchKind::ARMV7K)
        // Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k.
        // ARMv7k does not use SjLj exception handling.
        CPUString = "cortex-a7";
    }
  }

  // Insert the architecture feature derived from the target triple into the
  // feature string. This is important for setting features that are implied
  // based on the architecture version.
  std::string ArchFS = ARM_MC::ParseARMTriple(TargetTriple, CPUString);
  if (!FS.empty()) {
    if (!ArchFS.empty())
      ArchFS = (Twine(ArchFS) + "," + FS).str();
    else
      ArchFS = std::string(FS);
  }
  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, ArchFS);

  // FIXME: This used to enable V6T2 support implicitly for Thumb2 mode.
  // Assert this for now to make the change obvious.
  assert(hasV6T2Ops() || !hasThumb2());

  if (genExecuteOnly()) {
    // Execute only support for >= v8-M Baseline requires movt support.
    if (hasV8MBaselineOps())
      NoMovt = false;
    // Below v6-M execute-only code generation is not implemented at all.
    if (!hasV6MOps())
      report_fatal_error("Cannot generate execute-only code for this target");
  }

  // Keep a pointer to static instruction cost data for the specified CPU.
  SchedModel = getSchedModelForCPU(CPUString);

  // Initialize scheduling itinerary for the specified CPU.
  InstrItins = getInstrItineraryForCPU(CPUString);

  // FIXME: this is invalid for WindowsCE
  if (isTargetWindows())
    NoARM = true;

  // ABI-mandated stack alignment: 8 bytes for AAPCS, 16 for NaCl/AAPCS16.
  if (TM.isAAPCS_ABI())
    stackAlignment = Align(8);
  if (isTargetNaCl() || TM.isAAPCS16_ABI())
    stackAlignment = Align(16);

  // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo::
  // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as
  // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation
  // support in the assembler and linker to be used. This would need to be
  // fixed to fully support tail calls in Thumb1.
  //
  // For ARMv8-M, we /do/ implement tail calls.  Doing this is tricky for v8-M
  // baseline, since the LDM/POP instruction on Thumb doesn't take LR.  This
  // means if we need to reload LR, it takes extra instructions, which outweighs
  // the value of the tail call; but here we don't know yet whether LR is going
  // to be used. We take the optimistic approach of generating the tail call and
  // perhaps taking a hit if we need to restore the LR.

  // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
  // but we need to make sure there are enough registers; the only valid
  // registers are the 4 used for parameters.  We don't currently do this
  // case.

  SupportsTailCall = !isThumb1Only() || hasV8MBaselineOps();

  // Translate the IT-block command-line mode into the RestrictIT flag.
  switch (IT) {
  case DefaultIT:
    RestrictIT = false;
    break;
  case RestrictedIT:
    RestrictIT = true;
    break;
  }

  // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
  const FeatureBitset &Bits = getFeatureBits();
  if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
      (Options.UnsafeFPMath || isTargetDarwin()))
    HasNEONForFP = true;

  // Read-write position independence needs R9 kept free of allocation.
  if (isRWPI())
    ReserveR9 = true;

  // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2
  if (MVEVectorCostFactor == 0)
    MVEVectorCostFactor = 2;

  // Per-core tuning knobs (load/store-multiple timing, interleaving, latency
  // adjustments) keyed off the processor family.
  // FIXME: Teach TableGen to deal with these instead of doing it manually here.
  switch (ARMProcFamily) {
  case Others:
  case CortexA5:
    break;
  case CortexA7:
    LdStMultipleTiming = DoubleIssue;
    break;
  case CortexA8:
    LdStMultipleTiming = DoubleIssue;
    break;
  case CortexA9:
    LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
    PreISelOperandLatencyAdjustment = 1;
    break;
  case CortexA12:
    break;
  case CortexA15:
    MaxInterleaveFactor = 2;
    PreISelOperandLatencyAdjustment = 1;
    PartialUpdateClearance = 12;
    break;
  case CortexA17:
  case CortexA32:
  case CortexA35:
  case CortexA53:
  case CortexA55:
  case CortexA57:
  case CortexA72:
  case CortexA73:
  case CortexA75:
  case CortexA76:
  case CortexA77:
  case CortexA78:
  case CortexA78AE:
  case CortexA78C:
  case CortexA510:
  case CortexA710:
  case CortexR4:
  case CortexR5:
  case CortexR7:
  case CortexM3:
  case CortexM55:
  case CortexM7:
  case CortexM85:
  case CortexR52:
  case CortexR52plus:
  case CortexX1:
  case CortexX1C:
    break;
  case Exynos:
    LdStMultipleTiming = SingleIssuePlusExtras;
    MaxInterleaveFactor = 4;
    if (!isThumb())
      PreferBranchLogAlignment = 3;
    break;
  case Kryo:
    break;
  case Krait:
    PreISelOperandLatencyAdjustment = 1;
    break;
  case NeoverseV1:
    break;
  case Swift:
    MaxInterleaveFactor = 2;
    LdStMultipleTiming = SingleIssuePlusExtras;
    PreISelOperandLatencyAdjustment = 1;
    PartialUpdateClearance = 12;
    break;
  }
}
308 
isROPI() const309 bool ARMSubtarget::isROPI() const {
310   return TM.getRelocationModel() == Reloc::ROPI ||
311          TM.getRelocationModel() == Reloc::ROPI_RWPI;
312 }
isRWPI() const313 bool ARMSubtarget::isRWPI() const {
314   return TM.getRelocationModel() == Reloc::RWPI ||
315          TM.getRelocationModel() == Reloc::ROPI_RWPI;
316 }
317 
/// isGVIndirectSymbol - True when references to GV must go through an
/// indirection rather than referencing the symbol directly.
bool ARMSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
  // Anything not known to resolve within this DSO needs indirection.
  if (!TM.shouldAssumeDSOLocal(GV))
    return true;

  // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
  // the section that is being relocated. This means we have to use a load even
  // for GVs that are known to be local to the dso.
  if (isTargetMachO() && TM.isPositionIndependent() &&
      (GV->isDeclarationForLinker() || GV->hasCommonLinkage()))
    return true;

  return false;
}
331 
isGVInGOT(const GlobalValue * GV) const332 bool ARMSubtarget::isGVInGOT(const GlobalValue *GV) const {
333   return isTargetELF() && TM.isPositionIndependent() && !GV->isDSOLocal();
334 }
335 
getMispredictionPenalty() const336 unsigned ARMSubtarget::getMispredictionPenalty() const {
337   return SchedModel.MispredictPenalty;
338 }
339 
enableMachineScheduler() const340 bool ARMSubtarget::enableMachineScheduler() const {
341   // The MachineScheduler can increase register usage, so we use more high
342   // registers and end up with more T2 instructions that cannot be converted to
343   // T1 instructions. At least until we do better at converting to thumb1
344   // instructions, on cortex-m at Oz where we are size-paranoid, don't use the
345   // Machine scheduler, relying on the DAG register pressure scheduler instead.
346   if (isMClass() && hasMinSize())
347     return false;
348   // Enable the MachineScheduler before register allocation for subtargets
349   // with the use-misched feature.
350   return useMachineScheduler();
351 }
352 
enableSubRegLiveness() const353 bool ARMSubtarget::enableSubRegLiveness() const {
354   // Enable SubRegLiveness for MVE to better optimize s subregs for mqpr regs
355   // and q subregs for qqqqpr regs.
356   return hasMVEIntegerOps();
357 }
358 
enableMachinePipeliner() const359 bool ARMSubtarget::enableMachinePipeliner() const {
360   // Enable the MachinePipeliner before register allocation for subtargets
361   // with the use-mipipeliner feature.
362   return getSchedModel().hasInstrSchedModel() && useMachinePipeliner();
363 }
364 
useDFAforSMS() const365 bool ARMSubtarget::useDFAforSMS() const { return false; }
366 
367 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
enablePostRAScheduler() const368 bool ARMSubtarget::enablePostRAScheduler() const {
369   if (enableMachineScheduler())
370     return false;
371   if (disablePostRAScheduler())
372     return false;
373   // Thumb1 cores will generally not benefit from post-ra scheduling
374   return !isThumb1Only();
375 }
376 
enablePostRAMachineScheduler() const377 bool ARMSubtarget::enablePostRAMachineScheduler() const {
378   if (!enableMachineScheduler())
379     return false;
380   if (disablePostRAScheduler())
381     return false;
382   return !isThumb1Only();
383 }
384 
useStride4VFPs() const385 bool ARMSubtarget::useStride4VFPs() const {
386   // For general targets, the prologue can grow when VFPs are allocated with
387   // stride 4 (more vpush instructions). But WatchOS uses a compact unwind
388   // format which it's more important to get right.
389   return isTargetWatchABI() ||
390          (useWideStrideVFP() && !OptMinSize);
391 }
392 
useMovt() const393 bool ARMSubtarget::useMovt() const {
394   // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
395   // immediates as it is inherently position independent, and may be out of
396   // range otherwise.
397   return !NoMovt && hasV8MBaselineOps() &&
398          (isTargetWindows() || !OptMinSize || genExecuteOnly());
399 }
400 
useFastISel() const401 bool ARMSubtarget::useFastISel() const {
402   // Enable fast-isel for any target, for testing only.
403   if (ForceFastISel)
404     return true;
405 
406   // Limit fast-isel to the targets that are or have been tested.
407   if (!hasV6Ops())
408     return false;
409 
410   // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
411   return TM.Options.EnableFastISel &&
412          ((isTargetMachO() && !isThumb1Only()) ||
413           (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
414 }
415 
getGPRAllocationOrder(const MachineFunction & MF) const416 unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
417   // The GPR register class has multiple possible allocation orders, with
418   // tradeoffs preferred by different sub-architectures and optimisation goals.
419   // The allocation orders are:
420   // 0: (the default tablegen order, not used)
421   // 1: r14, r0-r13
422   // 2: r0-r7
423   // 3: r0-r7, r12, lr, r8-r11
424   // Note that the register allocator will change this order so that
425   // callee-saved registers are used later, as they require extra work in the
426   // prologue/epilogue (though we sometimes override that).
427 
428   // For thumb1-only targets, only the low registers are allocatable.
429   if (isThumb1Only())
430     return 2;
431 
432   // Allocate low registers first, so we can select more 16-bit instructions.
433   // We also (in ignoreCSRForAllocationOrder) override  the default behaviour
434   // with regards to callee-saved registers, because pushing extra registers is
435   // much cheaper (in terms of code size) than using high registers. After
436   // that, we allocate r12 (doesn't need to be saved), lr (saving it means we
437   // can return with the pop, don't need an extra "bx lr") and then the rest of
438   // the high registers.
439   if (isThumb2() && MF.getFunction().hasMinSize())
440     return 3;
441 
442   // Otherwise, allocate in the default order, using LR first because saving it
443   // allows a shorter epilogue sequence.
444   return 1;
445 }
446 
ignoreCSRForAllocationOrder(const MachineFunction & MF,MCRegister PhysReg) const447 bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
448                                                MCRegister PhysReg) const {
449   // To minimize code size in Thumb2, we prefer the usage of low regs (lower
450   // cost per use) so we can  use narrow encoding. By default, caller-saved
451   // registers (e.g. lr, r12) are always  allocated first, regardless of
452   // their cost per use. When optForMinSize, we prefer the low regs even if
453   // they are CSR because usually push/pop can be folded into existing ones.
454   return isThumb2() && MF.getFunction().hasMinSize() &&
455          ARM::GPRRegClass.contains(PhysReg);
456 }
457 
/// getPushPopSplitVariation - Decide whether (and where) the callee-saved
/// register push/pop sequence for this function must be split into two
/// instructions. The checks are ordered by priority: Thumb1, then an R7
/// frame record, then Windows SEH, then return-address signing with R11.
ARMSubtarget::PushPopSplitVariation
ARMSubtarget::getPushPopSplitVariation(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Thumb1 always splits the pushes at R7, because the Thumb1 push instruction
  // cannot use high registers except for lr.
  if (isThumb1Only())
    return SplitR7;

  // If R7 is the frame pointer, we must split at R7 to ensure that the
  // previous frame pointer (R7) and return address (LR) are adjacent on the
  // stack, to form a valid frame record.
  if (getFramePointerReg() == ARM::R7 &&
      MF.getTarget().Options.FramePointerIsReserved(MF))
    return SplitR7;

  // Returns SplitR11WindowsSEH when the stack pointer needs to be
  // restored from the frame pointer r11 + an offset and Windows CFI is enabled.
  // This stack unwinding cannot be expressed with SEH unwind opcodes when done
  // with a single push, making it necessary to split the push into r4-r10, and
  // another containing r11+lr.
  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
      F.needsUnwindTableEntry() &&
      (MFI.hasVarSizedObjects() || getRegisterInfo()->hasStackRealignment(MF)))
    return SplitR11WindowsSEH;

  // Returns SplitR11AAPCSSignRA when the frame pointer is R11, requiring R11
  // and LR to be adjacent on the stack, and branch signing is enabled,
  // requiring R12 to be on the stack.
  if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress() &&
      getFramePointerReg() == ARM::R11 &&
      MF.getTarget().Options.FramePointerIsReserved(MF))
    return SplitR11AAPCSSignRA;
  return NoSplit;
}
494