//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the AArch64 implementation of TargetFrameLowering class.
//
// On AArch64, stack frames are structured as follows:
//
// The stack grows downward.
//
// All of the individual frame areas on the frame below are optional, i.e. it's
// possible to create a function so that the particular area isn't present
// in the frame.
//
// At function entry, the "frame" looks as follows:
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// After the prologue has run, the frame has the following general structure.
// Note that this doesn't depict the case where a red-zone is used. Also,
// technically the last frame area (VLAs) doesn't get created until in the
// main function body, after the prologue is run. However, it's depicted here
// for completeness.
//
// |                                   | Higher address
// |-----------------------------------|
// |                                   |
// | arguments passed on the stack     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) varargs from reg     |
// |                                   |
// |-----------------------------------|
// |                                   |
// | (Win64 only) callee-saved SVE reg |
// |                                   |
// |-----------------------------------|
// |                                   |
// | callee-saved gpr registers        | <--.
// |                                   |    | On Darwin platforms these
// |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
// | prev_lr                           |    | (frame record first)
// | prev_fp                           | <--'
// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
// |   <hazard padding>                |
// |-----------------------------------|
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |-----------------------------------|
// |                                   |
// |        SVE stack objects          |
// |                                   |
// |-----------------------------------|
// |.empty.space.to.make.part.below....|
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....|  compile time; if present)
// |-----------------------------------|
// | local variables of fixed size     |
// | including spill slots             |
// |   <FPR>                           |
// |   <hazard padding>                |
// |   <GPR>                           |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
// |...................................|  compile time)
// |-----------------------------------| <- sp
// |                                   | Lower address
//
//
// To access data in a frame at compile time, a constant offset must be
// computable from one of the pointers (fp, bp, sp). The sizes of the areas
// with a dotted background cannot be computed at compile time if they are
// present, so all three of fp, bp and sp must be set up in order to access
// all of the frame's contents (assuming all of the frame areas are
// non-empty).
//
// For most functions, some of the frame areas are empty. For those functions,
// it may not be necessary to set up fp or bp:
// * A base pointer is definitely needed when there are both VLAs and local
//   variables with more-than-default alignment requirements.
// * A frame pointer is definitely needed when there are local variables with
//   more-than-default alignment requirements.
//
// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
// callee-saved area, since the unwind encoding does not allow for encoding
// this dynamically and existing tools depend on this layout. For other
// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
// area to allow SVE stack objects (allocated directly below the callee-saves,
// if available) to be accessed directly from the framepointer.
// The SVE spill/fill instructions have VL-scaled addressing modes such
// as:
//    ldr z8, [fp, #-7 mul vl]
// For SVE the size of the vector length (VL) is not known at compile-time, so
// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
// layout, we don't need to add an unscaled offset to the framepointer before
// accessing the SVE object in the frame.
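// As a concrete illustration (plain arithmetic, not from the ABI): with a
// 128-bit VL one vector is 16 bytes, so '#-7 mul vl' evaluates to -112 bytes;
// with a 512-bit VL it evaluates to -448 bytes. The same instruction encoding
// covers both.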
//
// In some cases when a base pointer is not strictly needed, it is generated
// anyway when offsets from the frame pointer to access local variables become
// so large that the offset can't be encoded in the immediate fields of loads
// or stores.
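// For example (illustrative only), an 8-byte LDR from sp encodes an unsigned,
// scaled 12-bit immediate (0..32760 in steps of 8); a local past that range
// needs its offset materialized in a scratch register first:
//    mov x8, #40960
//    ldr x0, [sp, x8]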
//
// Outgoing function arguments must be at the bottom of the stack frame when
// calling another function. If we do not have variable-sized stack objects, we
// can allocate a "reserved call frame" area at the bottom of the local
// variable area, large enough for all outgoing calls. If we do have VLAs, then
// the stack pointer must be decremented and incremented around each call to
// make space for the arguments below the VLAs.
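// A sketch of the VLA case (illustrative only): each call site gets its own
// sp adjustment instead of a reserved call frame:
//    sub sp, sp, #32   // make space for this call's outgoing arguments
//    bl  callee
//    add sp, sp, #32   // release it after the call returns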
//
// FIXME: also explain the redzone concept.
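// (Briefly: when the red zone is usable, a leaf function may keep its small
// locals in the area just below sp without adjusting sp at all; see
// canUseRedZone() below and the -aarch64-redzone option at the top of this
// file.)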
//
// About stack hazards: Under some SME contexts, a coprocessor with its own
// separate cache can be used for FP operations. This can create hazards if the
// CPU and the SME unit try to access the same area of memory, including if the
// access is to an area of the stack. To try to alleviate this, we attempt to
// introduce extra padding into the stack frame between FP and GPR accesses,
// controlled by the aarch64-stack-hazard-size option. Without changing the
// layout of the stack frame in the diagram above, a stack object of size
// aarch64-stack-hazard-size is added between GPR and FPR CSRs. Another is
// added to the stack objects section, and stack objects are sorted so that
// FPR > Hazard padding slot > GPRs (where possible). Unfortunately some things
// are not handled well (VLA area, arguments on the stack, objects with both
// GPR and FPR accesses), but if those are controlled by the user then the
// entire stack frame becomes GPR at the start/end with FPR in the middle,
// surrounded by Hazard padding.
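// (For example, passing '-mllvm -aarch64-stack-hazard-size=1024' from clang
// requests 1024 bytes of padding in each of the two places described above,
// and a size of 0 disables the padding. The option itself is defined
// elsewhere; only the remark-size variant below lives in this file.)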
//
// An example of the prologue:
//
//     .globl __foo
//     .align 2
//  __foo:
//  Ltmp0:
//     .cfi_startproc
//     .cfi_personality 155, ___gxx_personality_v0
//  Leh_func_begin:
//     .cfi_lsda 16, Lexception33
//
//     stp  xa, bx, [sp, #-offset]!
//     ...
//     stp  x28, x27, [sp, #offset - 32]
//     stp  fp, lr, [sp, #offset - 16]
//     add  fp, sp, #offset - 16
//     sub  sp, sp, #1360
//
// The Stack:
//       +-------------------------------------------+
// 10000 | ........ | ........ | ........ | ........ |
// 10004 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10008 | ........ | ........ | ........ | ........ |
// 1000c | ........ | ........ | ........ | ........ |
//       +===========================================+
// 10010 |                X28 Register               |
// 10014 |                X28 Register               |
//       +-------------------------------------------+
// 10018 |                X27 Register               |
// 1001c |                X27 Register               |
//       +===========================================+
// 10020 |                Frame Pointer              |
// 10024 |                Frame Pointer              |
//       +-------------------------------------------+
// 10028 |                Link Register              |
// 1002c |                Link Register              |
//       +===========================================+
// 10030 | ........ | ........ | ........ | ........ |
// 10034 | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
// 10038 | ........ | ........ | ........ | ........ |
// 1003c | ........ | ........ | ........ | ........ |
//       +-------------------------------------------+
//
//     [sp] = 10030        ::    >>initial value<<
//     sp = 10020          ::  stp fp, lr, [sp, #-16]!
//     fp = sp == 10020    ::  mov fp, sp
//     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
//     sp == 10010         ::  >>final value<<
//
// The frame pointer (w29) points to address 10020. If we use an offset of
// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
// for w27, and -32 for w28:
//
//  Ltmp1:
//     .cfi_def_cfa w29, 16
//  Ltmp2:
//     .cfi_offset w30, -8
//  Ltmp3:
//     .cfi_offset w29, -16
//  Ltmp4:
//     .cfi_offset w27, -24
//  Ltmp5:
//     .cfi_offset w28, -32
//
//===----------------------------------------------------------------------===//

#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CFIInstBuilder.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
#include <cstdint>
#include <iterator>
#include <optional>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableRedZone("aarch64-redzone",
                                   cl::desc("enable use of redzone on AArch64"),
                                   cl::init(false), cl::Hidden);

static cl::opt<bool> StackTaggingMergeSetTag(
    "stack-tagging-merge-settag",
    cl::desc("merge settag instruction in function epilog"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                       cl::desc("sort stack allocations"),
                                       cl::init(true), cl::Hidden);

cl::opt<bool> EnableHomogeneousPrologEpilog(
    "homogeneous-prolog-epilog", cl::Hidden,
    cl::desc("Emit homogeneous prologue and epilogue for the size "
             "optimization (default = off)"));

// Stack hazard size for analysis remarks. StackHazardSize takes precedence.
static cl::opt<unsigned>
    StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0),
                          cl::Hidden);
// Whether to insert padding into non-streaming functions (for testing).
static cl::opt<bool>
    StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
                              cl::init(false), cl::Hidden);

static cl::opt<bool> DisableMultiVectorSpillFill(
    "aarch64-disable-multivector-spill-fill",
    cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false),
    cl::Hidden);

STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
/// clean up in an epilogue. For the C calling convention this will be 0, for
/// guaranteed tail call conventions it can be positive (a normal return or a
/// tail call to a function that uses less stack space for arguments) or
/// negative (for a tail call to a function that needs more stack space than us
/// for arguments).
static int64_t getArgumentStackToRestore(MachineFunction &MF,
                                         MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool IsTailCallReturn = (MBB.end() != MBBI)
                              ? AArch64InstrInfo::isTailCallReturnInst(*MBBI)
                              : false;

  int64_t ArgumentPopSize = 0;
  if (IsTailCallReturn) {
    MachineOperand &StackAdjust = MBBI->getOperand(1);

    // For a tail-call in a callee-pops-arguments environment, some or all of
    // the stack may actually be in use for the call's arguments; this is
    // calculated during LowerCall and consumed here...
    ArgumentPopSize = StackAdjust.getImm();
  } else {
    // ... otherwise the amount to pop is *all* of the argument space,
    // conveniently stored in the MachineFunctionInfo by
    // LowerFormalArguments. This will, of course, be zero for the C calling
    // convention.
    ArgumentPopSize = AFI->getArgumentStackToRestore();
  }

  return ArgumentPopSize;
}

static bool produceCompactUnwindFrame(MachineFunction &MF);
static bool needsWinCFI(const MachineFunction &MF);
static StackOffset getSVEStackSize(const MachineFunction &MF);
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
                                                 bool HasCall = false);
static bool requiresSaveVG(const MachineFunction &MF);

// Conservatively returns true if the function is likely to have SVE vectors
// on the stack. This function is safe to be called before callee-saves or
// object offsets have been determined.
static bool isLikelyToHaveSVEStack(MachineFunction &MF) {
  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (AFI->isSVECC())
    return true;

  if (AFI->hasCalculatedStackSizeSVE())
    return bool(getSVEStackSize(MF));

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd(); FI++) {
    if (MFI.getStackID(FI) == TargetStackID::ScalableVector)
      return true;
  }

  return false;
}

/// Returns true if a homogeneous prolog or epilog code can be emitted
/// for the size optimization. If possible, a frame helper call is injected.
/// When Exit block is given, this check is for epilog.
bool AArch64FrameLowering::homogeneousPrologEpilog(
    MachineFunction &MF, MachineBasicBlock *Exit) const {
  if (!MF.getFunction().hasMinSize())
    return false;
  if (!EnableHomogeneousPrologEpilog)
    return false;
  if (EnableRedZone)
    return false;

  // TODO: Windows is not supported yet.
  if (needsWinCFI(MF))
    return false;

  // TODO: SVE is not supported yet.
  if (isLikelyToHaveSVEStack(MF))
    return false;

  // Bail on stack adjustment needed on return for simplicity.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
  if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
    return false;
  if (Exit && getArgumentStackToRestore(MF, *Exit))
    return false;

  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
    return false;

  // If there are an odd number of GPRs before LR and FP in the CSRs list,
  // they will not be paired into one RegPairInfo, which is incompatible with
  // the assumption made by the homogeneous prolog epilog pass.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  unsigned NumGPRs = 0;
  for (unsigned I = 0; CSRegs[I]; ++I) {
    Register Reg = CSRegs[I];
    if (Reg == AArch64::LR) {
      assert(CSRegs[I + 1] == AArch64::FP);
      if (NumGPRs % 2 != 0)
        return false;
      break;
    }
    if (AArch64::GPR64RegClass.contains(Reg))
      ++NumGPRs;
  }

  return true;
}

/// Returns true if CSRs should be paired.
bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
}

/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exceptions here are vector stores/loads, which cannot encode
/// any displacements (see estimateRSStackSizeLimit(),
/// isAArch64FrameOffsetLegal()).
static const unsigned DefaultSafeSPDisplacement = 255;
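// (255 is the largest positive offset encodable in the signed 9-bit immediate
// of the unscaled LDUR/STUR addressing mode, whose range is -256..+255.)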

/// Look at each instruction that references stack frames and return the stack
/// size limit beyond which some of these instructions will require a scratch
/// register during their expansion later.
static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
  // range. We'll end up allocating an unnecessary spill slot a lot, but
  // realistically that's not a big deal at this stage of the game.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.isDebugInstr() || MI.isPseudo() ||
          MI.getOpcode() == AArch64::ADDXri ||
          MI.getOpcode() == AArch64::ADDSXri)
        continue;

      for (const MachineOperand &MO : MI.operands()) {
        if (!MO.isFI())
          continue;

        StackOffset Offset;
        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
            AArch64FrameOffsetCannotUpdate)
          return 0;
      }
    }
  }
  return DefaultSafeSPDisplacement;
}

TargetStackID::Value
AArch64FrameLowering::getStackIDForScalableVectors() const {
  return TargetStackID::ScalableVector;
}

/// Returns the size of the fixed object area (allocated next to sp on entry).
/// On Win64 this may include a var args area and an UnwindHelp object for EH.
static unsigned getFixedObjectSize(const MachineFunction &MF,
                                   const AArch64FunctionInfo *AFI, bool IsWin64,
                                   bool IsFunclet) {
  assert(AFI->getTailCallReservedStack() % 16 == 0 &&
         "Tail call reserved stack must be aligned to 16 bytes");
  if (!IsWin64 || IsFunclet) {
    return AFI->getTailCallReservedStack();
  } else {
    if (AFI->getTailCallReservedStack() != 0 &&
        !MF.getFunction().getAttributes().hasAttrSomewhere(
            Attribute::SwiftAsync))
      report_fatal_error("cannot generate ABI-changing tail call for Win64");
    unsigned FixedObjectSize = AFI->getTailCallReservedStack();

    // Var args are stored here in the primary function.
    FixedObjectSize += AFI->getVarArgsGPRSize();

    if (MF.hasEHFunclets()) {
      // Catch objects are stored here in the primary function.
      const MachineFrameInfo &MFI = MF.getFrameInfo();
      const WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
      SmallSetVector<int, 8> CatchObjFrameIndices;
      for (const WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
        for (const WinEHHandlerType &H : TBME.HandlerArray) {
          int FrameIndex = H.CatchObj.FrameIndex;
          if ((FrameIndex != INT_MAX) &&
              CatchObjFrameIndices.insert(FrameIndex)) {
            FixedObjectSize = alignTo(FixedObjectSize,
                                      MFI.getObjectAlign(FrameIndex).value()) +
                              MFI.getObjectSize(FrameIndex);
          }
        }
      }
      // To support EH funclets we allocate an UnwindHelp object
      FixedObjectSize += 8;
    }
    return alignTo(FixedObjectSize, 16);
  }
}

/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
}

bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
  if (!EnableRedZone)
    return false;

  // Don't use the red zone if the function explicitly asks us not to.
  // This is typically used for kernel code.
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
  if (!RedZoneSize)
    return false;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  uint64_t NumBytes = AFI->getLocalStackSize();

  // If neither NEON nor SVE is available, a COPY from one Q-reg to
  // another requires a spill -> reload sequence. We can do that
  // using a pre-decrementing store/post-decrementing load, but
  // if we do so, we can't use the Red Zone.
  bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() &&
                                 !Subtarget.isNeonAvailable() &&
                                 !Subtarget.hasSVE();

  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
           getSVEStackSize(MF) || LowerQRegCopyThroughMem);
}

/// hasFPImpl - Return true if the specified function should have a dedicated
/// frame pointer register.
bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();

  // Win64 EH requires a frame pointer if funclets are present, as the locals
  // are accessed off the frame pointer in both the parent function and the
  // funclets.
  if (MF.hasEHFunclets())
    return true;
  // Retain behavior of always omitting the FP for leaf functions when
  // possible.
  if (MF.getTarget().Options.DisableFramePointerElim(MF))
    return true;
  if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
      MFI.hasStackMap() || MFI.hasPatchPoint() ||
      RegInfo->hasStackRealignment(MF))
    return true;
  // With large callframes around we may need to use FP to access the
  // scavenging emergency spillslot.
  //
  // Unfortunately some calls to hasFP() like machine verifier ->
  // getReservedReg() -> hasFP in the middle of global isel are too early
  // to know the max call frame size. Hopefully conservatively returning "true"
  // in those cases is fine.
  // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
  if (!MFI.isMaxCallFrameSizeComputed() ||
      MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
    return true;

  return false;
}

/// Should the Frame Pointer be reserved for the current function?
bool AArch64FrameLowering::isFPReserved(const MachineFunction &MF) const {
  const TargetMachine &TM = MF.getTarget();
  const Triple &TT = TM.getTargetTriple();

  // These OSes require the frame chain to be valid, even if the current frame
  // does not use a frame pointer.
  if (TT.isOSDarwin() || TT.isOSWindows())
    return true;

  // If the function has a frame pointer, it is reserved.
  if (hasFP(MF))
    return true;

  // Frontend has requested to preserve the frame pointer.
  if (TM.Options.FramePointerIsReserved(MF))
    return true;

  return false;
}

/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
/// not required, we reserve argument space for call sites in the function
/// immediately on entry to the current function. This eliminates the need for
/// add/sub sp brackets around call sites. Returns true if the call frame is
/// included as part of the stack frame.
bool AArch64FrameLowering::hasReservedCallFrame(
    const MachineFunction &MF) const {
  // The stack probing code for the dynamically allocated outgoing arguments
  // area assumes that the stack is probed at the top - either by the prologue
  // code, which issues a probe if `hasVarSizedObjects` returns true, or by the
  // most recent variable-sized object allocation. Changing the condition here
  // may need to be followed up by changes to the probe issuing logic.
  return !MF.getFrameInfo().hasVarSizedObjects();
}

MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator I) const {
  const AArch64InstrInfo *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
  const AArch64TargetLowering *TLI =
      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
  [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
  DebugLoc DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    int64_t Amount = I->getOperand(0).getImm();
    Amount = alignTo(Amount, getStackAlign());
    if (!IsDestroy)
      Amount = -Amount;

    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
    // doesn't have to pop anything), then the first operand will be zero too so
    // this adjustment is a no-op.
    if (CalleePopAmount == 0) {
      // FIXME: in-function stack adjustment for calls is limited to 24-bits
      // because there's no guaranteed temporary register available.
      //
      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
      // 1) For offset <= 12-bit, we use LSL #0
      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
      //    LSL #0, and the other uses LSL #12.
      //
      // Most call frames will be allocated at the start of a function so
      // this is OK, but it is a limitation that needs dealing with.
      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");

      if (TLI->hasInlineStackProbe(MF) &&
          -Amount >= AArch64::StackProbeMaxUnprobedStack) {
        // When stack probing is enabled, the decrement of SP may need to be
        // probed. We only need to do this if the call site needs 1024 bytes of
        // space or more, because a region smaller than that is allowed to be
        // unprobed at an ABI boundary. We rely on the fact that SP has been
        // probed exactly at this point, either by the prologue or most recent
        // dynamic allocation.
        assert(MFI.hasVarSizedObjects() &&
               "non-reserved call frame without var sized objects?");
        Register ScratchReg =
            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
        inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
      } else {
        emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                        StackOffset::getFixed(Amount), TII);
      }
    }
  } else if (CalleePopAmount != 0) {
    // If the calling convention demands that the callee pops arguments from the
    // stack, we want to add it back if we have a reserved call frame.
    assert(CalleePopAmount < 0xffffff && "call frame too large");
    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
  }
  return MBB.erase(I);
}

void AArch64FrameLowering::emitCalleeSavedGPRLocations(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  SMEAttrs Attrs = AFI->getSMEFnAttrs();
  bool LocallyStreaming =
      Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();

  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
  for (const auto &Info : CSI) {
    unsigned FrameIdx = Info.getFrameIdx();
    if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
      continue;

    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
    int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();

    // The location of VG will be emitted before each streaming-mode change in
    // the function. Only locally-streaming functions require emitting the
    // non-streaming VG location here.
    if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
        (!LocallyStreaming && Info.getReg() == AArch64::VG))
      continue;

    CFIBuilder.buildOffset(Info.getReg(), Offset);
  }
}

void AArch64FrameLowering::emitCalleeSavedSVELocations(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  // Add callee saved registers to move list.
  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  const TargetSubtargetInfo &STI = MF.getSubtarget();
  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
  AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);

  for (const auto &Info : CSI) {
    if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
      continue;

    // Not all unwinders may know about SVE registers, so assume the lowest
    // common denominator.
    assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
    MCRegister Reg = Info.getReg();
    if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
      continue;

    StackOffset Offset =
        StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
        StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));

    CFIBuilder.insertCFIInst(createCFAOffset(TRI, Reg, Offset));
  }
}

void AArch64FrameLowering::resetCFIToInitialState(
    MachineBasicBlock &MBB) const {

  MachineFunction &MF = *MBB.getParent();
  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const auto &TRI = *Subtarget.getRegisterInfo();
  const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();

  CFIInstBuilder CFIBuilder(MBB, MBB.begin(), MachineInstr::NoFlags);

  // Reset the CFA to `SP + 0`.
  CFIBuilder.buildDefCFA(AArch64::SP, 0);

  // Flip the RA sign state.
  if (MFI.shouldSignReturnAddress(MF))
    MFI.branchProtectionPAuthLR() ? CFIBuilder.buildNegateRAStateWithPC()
                                  : CFIBuilder.buildNegateRAState();

  // Shadow call stack uses X18, reset it.
  if (MFI.needsShadowCallStackPrologueEpilogue(MF))
    CFIBuilder.buildSameValue(AArch64::X18);

  // Emit .cfi_same_value for callee-saved registers.
  const std::vector<CalleeSavedInfo> &CSI =
      MF.getFrameInfo().getCalleeSavedInfo();
  for (const auto &Info : CSI) {
    MCRegister Reg = Info.getReg();
    if (!TRI.regNeedsCFI(Reg, Reg))
      continue;
    CFIBuilder.buildSameValue(Reg);
  }
}

static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    bool SVE) {
  MachineFunction &MF = *MBB.getParent();
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
  if (CSI.empty())
    return;

  const TargetSubtargetInfo &STI = MF.getSubtarget();
  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
  CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameDestroy);

  for (const auto &Info : CSI) {
    if (SVE !=
        (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
      continue;

    MCRegister Reg = Info.getReg();
    if (SVE &&
        !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
      continue;

    if (!Info.isRestored())
      continue;

    CFIBuilder.buildRestore(Info.getReg());
  }
}

void AArch64FrameLowering::emitCalleeSavedGPRRestores(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  emitCalleeSavedRestores(MBB, MBBI, false);
}

void AArch64FrameLowering::emitCalleeSavedSVERestores(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
  emitCalleeSavedRestores(MBB, MBBI, true);
}

// Return the maximum possible number of bytes for `Size` due to the
// architectural limit on the size of an SVE register.
static int64_t upperBound(StackOffset Size) {
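  // (For reference: the architecture caps the SVE vector length at 2048 bits,
  // i.e. 16x the 128-bit granule in which scalable offsets are expressed,
  // hence the factor of 16 below.)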
  static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
  return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
}

void AArch64FrameLowering::allocateStackSpace(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
    bool FollowupAllocs) const {

  if (!AllocSize)
    return;

  DebugLoc DL;
  MachineFunction &MF = *MBB.getParent();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
  AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  const int64_t MaxAlign = MFI.getMaxAlign().value();
  const uint64_t AndMask = ~(MaxAlign - 1);

  if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
    Register TargetReg = RealignmentPadding
                             ? findScratchNonCalleeSaveRegister(&MBB)
                             : AArch64::SP;
    // SUB Xd/SP, SP, AllocSize
    emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
                    EmitCFI, InitialOffset);

    if (RealignmentPadding) {
      // AND SP, X9, 0b11111...0000
      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
          .addReg(TargetReg, RegState::Kill)
          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
          .setMIFlags(MachineInstr::FrameSetup);
      AFI.setStackRealigned(true);

      // No need for SEH instructions here; if we're realigning the stack,
      // we've set a frame pointer and already finished the SEH prologue.
      assert(!NeedsWinCFI);
    }
    return;
  }

  //
  // Stack probing allocation.
  //

  // Fixed length allocation. If we don't need to re-align the stack and don't
  // have SVE objects, we can use a more efficient sequence for stack probing.
  if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
    Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
    assert(ScratchReg != AArch64::NoRegister);
    BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC))
        .addDef(ScratchReg)
        .addImm(AllocSize.getFixed())
        .addImm(InitialOffset.getFixed())
        .addImm(InitialOffset.getScalable());
    // The fixed allocation may leave unprobed bytes at the top of the
    // stack. If we have subsequent allocation (e.g. if we have variable-sized
    // objects), we need to issue an extra probe, so these allocations start in
    // a known state.
    if (FollowupAllocs) {
      // STR XZR, [SP]
      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
          .addReg(AArch64::XZR)
          .addReg(AArch64::SP)
          .addImm(0)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    return;
  }

  // Variable length allocation.

  // If the (unknown) allocation size cannot exceed the probe size, decrement
  // the stack pointer right away.
  int64_t ProbeSize = AFI.getStackProbeSize();
  if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) {
    Register ScratchReg = RealignmentPadding
                              ? findScratchNonCalleeSaveRegister(&MBB)
                              : AArch64::SP;
    assert(ScratchReg != AArch64::NoRegister);
    // SUB Xd, SP, AllocSize
    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
                    EmitCFI, InitialOffset);
    if (RealignmentPadding) {
      // AND SP, Xn, 0b11111...0000
      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
          .addReg(ScratchReg, RegState::Kill)
          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
          .setMIFlags(MachineInstr::FrameSetup);
      AFI.setStackRealigned(true);
    }
    if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
                              AArch64::StackProbeMaxUnprobedStack) {
      // STR XZR, [SP]
      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
          .addReg(AArch64::XZR)
          .addReg(AArch64::SP)
          .addImm(0)
          .setMIFlags(MachineInstr::FrameSetup);
    }
    return;
  }

  // Emit a variable-length allocation probing loop.
  // TODO: As an optimisation, the loop can be "unrolled" into a few parts,
  // each of them guaranteed to adjust the stack by less than the probe size.
  Register TargetReg = findScratchNonCalleeSaveRegister(&MBB);
  assert(TargetReg != AArch64::NoRegister);
  // SUB Xd, SP, AllocSize
  emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
                  MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
                  EmitCFI, InitialOffset);
  if (RealignmentPadding) {
    // AND Xn, Xn, 0b11111...0000
    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg)
        .addReg(TargetReg, RegState::Kill)
        .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
        .setMIFlags(MachineInstr::FrameSetup);
  }

  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
      .addReg(TargetReg);
  if (EmitCFI) {
    // Set the CFA register back to SP.
    CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
        .buildDefCFARegister(AArch64::SP);
  }
  if (RealignmentPadding)
    AFI.setStackRealigned(true);
}

static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
  switch (Reg.id()) {
  default:
    // The called routine is expected to preserve r19-r28;
    // r29 and r30 are used as frame pointer and link register, respectively.
    return 0;

  // GPRs
#define CASE(n)                                                                \
  case AArch64::W##n:                                                          \
  case AArch64::X##n:                                                          \
    return AArch64::X##n
    CASE(0);
    CASE(1);
    CASE(2);
    CASE(3);
    CASE(4);
    CASE(5);
    CASE(6);
    CASE(7);
    CASE(8);
    CASE(9);
    CASE(10);
    CASE(11);
    CASE(12);
    CASE(13);
    CASE(14);
    CASE(15);
    CASE(16);
    CASE(17);
    CASE(18);
#undef CASE

  // FPRs
#define CASE(n)                                                                \
  case AArch64::B##n:                                                          \
  case AArch64::H##n:                                                          \
  case AArch64::S##n:                                                          \
  case AArch64::D##n:                                                          \
  case AArch64::Q##n:                                                          \
    return HasSVE ? AArch64::Z##n : AArch64::Q##n
    CASE(0);
    CASE(1);
    CASE(2);
    CASE(3);
    CASE(4);
    CASE(5);
    CASE(6);
    CASE(7);
    CASE(8);
    CASE(9);
    CASE(10);
    CASE(11);
    CASE(12);
    CASE(13);
    CASE(14);
    CASE(15);
    CASE(16);
    CASE(17);
    CASE(18);
    CASE(19);
    CASE(20);
    CASE(21);
    CASE(22);
    CASE(23);
    CASE(24);
    CASE(25);
    CASE(26);
    CASE(27);
    CASE(28);
    CASE(29);
    CASE(30);
    CASE(31);
#undef CASE
  }
}

void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
                                                MachineBasicBlock &MBB) const {
  // Insertion point.
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();

  // Fake a debug loc.
  DebugLoc DL;
  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();

  const MachineFunction &MF = *MBB.getParent();
  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();

  BitVector GPRsToZero(TRI.getNumRegs());
  BitVector FPRsToZero(TRI.getNumRegs());
  bool HasSVE = STI.isSVEorStreamingSVEAvailable();
  for (MCRegister Reg : RegsToZero.set_bits()) {
    if (TRI.isGeneralPurposeRegister(MF, Reg)) {
      // For GPRs, we only care to clear out the 64-bit register.
      if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
        GPRsToZero.set(XReg);
    } else if (AArch64InstrInfo::isFpOrNEON(Reg)) {
      // For FPRs,
      if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
        FPRsToZero.set(XReg);
    }
  }

  const AArch64InstrInfo &TII = *STI.getInstrInfo();

  // Zero out GPRs.
  for (MCRegister Reg : GPRsToZero.set_bits())
    TII.buildClearRegister(Reg, MBB, MBBI, DL);

  // Zero out FP/vector registers.
  for (MCRegister Reg : FPRsToZero.set_bits())
    TII.buildClearRegister(Reg, MBB, MBBI, DL);

  if (HasSVE) {
    for (MCRegister PReg :
         {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
          AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
          AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
          AArch64::P15}) {
      if (RegsToZero[PReg])
        BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
    }
  }
}

static bool windowsRequiresStackProbe(const MachineFunction &MF,
                                      uint64_t StackSizeInBytes) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
  // TODO: When implementing stack protectors, take that into account
  // for the probe threshold.
  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
}

static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
                                   const MachineBasicBlock &MBB) {
  const MachineFunction *MF = MBB.getParent();
  LiveRegs.addLiveIns(MBB);
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);
}

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack
// pointer, but we would then have to make sure that we were in fact saving at
// least one callee-save register in the prologue, which is additional
// complexity that doesn't seem worth the benefit.
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
                                                 bool HasCall) {
  MachineFunction *MF = MBB->getParent();

  // If MBB is an entry block, use X9 as the scratch register.
  // preserve_none functions may be using X9 to pass arguments,
  // so prefer to pick an available register below.
  if (&MF->front() == MBB &&
      MF->getFunction().getCallingConv() != CallingConv::PreserveNone)
    return AArch64::X9;

  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  getLiveRegsForEntryMBB(LiveRegs, *MBB);
  if (HasCall) {
    LiveRegs.addReg(AArch64::X16);
    LiveRegs.addReg(AArch64::X17);
    LiveRegs.addReg(AArch64::X18);
  }

  // Prefer X9 since it was historically used for the prologue scratch reg.
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  if (LiveRegs.available(MRI, AArch64::X9))
    return AArch64::X9;

  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return AArch64::NoRegister;
}

bool AArch64FrameLowering::canUseAsPrologue(
    const MachineBasicBlock &MBB) const {
  const MachineFunction *MF = MBB.getParent();
  MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
  const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();

  if (AFI->hasSwiftAsyncContext()) {
    const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF->getRegInfo();
    LivePhysRegs LiveRegs(TRI);
    getLiveRegsForEntryMBB(LiveRegs, MBB);
    // The StoreSwiftAsyncContext clobbers X16 and X17. Make sure they are
    // available.
    if (!LiveRegs.available(MRI, AArch64::X16) ||
        !LiveRegs.available(MRI, AArch64::X17))
      return false;
  }

  // Certain stack probing sequences might clobber flags, then we can't use
  // the block as a prologue if the flags register is a live-in.
  if (MF->getInfo<AArch64FunctionInfo>()->hasStackProbing() &&
      MBB.isLiveIn(AArch64::NZCV))
    return false;

  if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF))
    if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister)
      return false;

  // May need a scratch register (for the return value) if required to make a
  // special call.
1170 if (requiresSaveVG(*MF) ||
1171 windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
1172 if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister)
1173 return false;
1174
1175 return true;
1176 }
1177
needsWinCFI(const MachineFunction & MF)1178 static bool needsWinCFI(const MachineFunction &MF) {
1179 const Function &F = MF.getFunction();
1180 return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1181 F.needsUnwindTableEntry();
1182 }
1183
shouldSignReturnAddressEverywhere(const MachineFunction & MF)1184 static bool shouldSignReturnAddressEverywhere(const MachineFunction &MF) {
1185 // FIXME: With WinCFI, extra care should be taken to place SEH_PACSignLR
1186 // and SEH_EpilogEnd instructions in the correct order.
1187 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
1188 return false;
1189 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1190 bool SignReturnAddressAll = AFI->shouldSignReturnAddress(/*SpillsLR=*/false);
1191 return SignReturnAddressAll;
1192 }
1193
shouldCombineCSRLocalStackBump(MachineFunction & MF,uint64_t StackBumpBytes) const1194 bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
1195 MachineFunction &MF, uint64_t StackBumpBytes) const {
1196 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1197 const MachineFrameInfo &MFI = MF.getFrameInfo();
1198 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1199 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1200 if (homogeneousPrologEpilog(MF))
1201 return false;
1202
1203 if (AFI->getLocalStackSize() == 0)
1204 return false;
1205
1206 // For WinCFI, if optimizing for size, prefer to not combine the stack bump
1207 // (to force a stp with predecrement) to match the packed unwind format,
1208 // provided that there actually are any callee saved registers to merge the
1209 // decrement with.
1210 // This is potentially marginally slower, but allows using the packed
1211 // unwind format for functions that both have a local area and callee saved
1212 // registers. Using the packed unwind format notably reduces the size of
1213 // the unwind info.
1214 if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
1215 MF.getFunction().hasOptSize())
1216 return false;
1217
1218 // 512 is the maximum immediate for stp/ldp that will be used for
1219 // callee-save save/restores
1220 if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
1221 return false;
1222
1223 if (MFI.hasVarSizedObjects())
1224 return false;
1225
1226 if (RegInfo->hasStackRealignment(MF))
1227 return false;
1228
1229 // This isn't strictly necessary, but it simplifies things a bit since the
1230 // current RedZone handling code assumes the SP is adjusted by the
1231 // callee-save save/restore code.
1232 if (canUseRedZone(MF))
1233 return false;
1234
1235 // When there is an SVE area on the stack, always allocate the
1236 // callee-saves and spills/locals separately.
1237 if (getSVEStackSize(MF))
1238 return false;
1239
1240 return true;
1241 }
1242
shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock & MBB,uint64_t StackBumpBytes) const1243 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
1244 MachineBasicBlock &MBB, uint64_t StackBumpBytes) const {
1245 if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
1246 return false;
1247 if (MBB.empty())
1248 return true;
1249
1250 // Disable combined SP bump if the last instruction is an MTE tag store. It
1251 // is almost always better to merge SP adjustment into those instructions.
1252 MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
1253 MachineBasicBlock::iterator Begin = MBB.begin();
1254 while (LastI != Begin) {
1255 --LastI;
1256 if (LastI->isTransient())
1257 continue;
1258 if (!LastI->getFlag(MachineInstr::FrameDestroy))
1259 break;
1260 }
1261 switch (LastI->getOpcode()) {
1262 case AArch64::STGloop:
1263 case AArch64::STZGloop:
1264 case AArch64::STGi:
1265 case AArch64::STZGi:
1266 case AArch64::ST2Gi:
1267 case AArch64::STZ2Gi:
1268 return false;
1269 default:
1270 return true;
1271 }
1272 llvm_unreachable("unreachable");
1273 }
1274
1275 // Given a load or a store instruction, generate an appropriate unwinding SEH
1276 // code on Windows.
InsertSEH(MachineBasicBlock::iterator MBBI,const TargetInstrInfo & TII,MachineInstr::MIFlag Flag)1277 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
1278 const TargetInstrInfo &TII,
1279 MachineInstr::MIFlag Flag) {
1280 unsigned Opc = MBBI->getOpcode();
1281 MachineBasicBlock *MBB = MBBI->getParent();
1282 MachineFunction &MF = *MBB->getParent();
1283 DebugLoc DL = MBBI->getDebugLoc();
1284 unsigned ImmIdx = MBBI->getNumOperands() - 1;
1285 int Imm = MBBI->getOperand(ImmIdx).getImm();
1286 MachineInstrBuilder MIB;
1287 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1288 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1289
1290 switch (Opc) {
1291 default:
1292 report_fatal_error("No SEH Opcode for this instruction");
1293 case AArch64::STR_ZXI:
1294 case AArch64::LDR_ZXI: {
1295 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1296 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveZReg))
1297 .addImm(Reg0)
1298 .addImm(Imm)
1299 .setMIFlag(Flag);
1300 break;
1301 }
1302 case AArch64::STR_PXI:
1303 case AArch64::LDR_PXI: {
1304 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SavePReg))
              .addImm(Reg0)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPDpost:
    Imm = -Imm;
    [[fallthrough]];
  case AArch64::STPDpre: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPXpost:
    Imm = -Imm;
    [[fallthrough]];
  case AArch64::STPXpre: {
    Register Reg0 = MBBI->getOperand(1).getReg();
    Register Reg1 = MBBI->getOperand(2).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRDpost:
    Imm = -Imm;
    [[fallthrough]];
  case AArch64::STRDpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDRXpost:
    Imm = -Imm;
    [[fallthrough]];
  case AArch64::STRXpre: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
              .addImm(Reg)
              .addImm(Imm)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPDi:
  case AArch64::LDPDi: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPXi:
  case AArch64::LDPXi: {
    Register Reg0 = MBBI->getOperand(0).getReg();
    Register Reg1 = MBBI->getOperand(1).getReg();
    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    else
      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
                .addImm(RegInfo->getSEHRegNum(Reg0))
                .addImm(RegInfo->getSEHRegNum(Reg1))
                .addImm(Imm * 8)
                .setMIFlag(Flag);
    break;
  }
  case AArch64::STRXui:
  case AArch64::LDRXui: {
    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STRDui:
  case AArch64::LDRDui: {
    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
              .addImm(Reg)
              .addImm(Imm * 8)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::STPQi:
  case AArch64::LDPQi: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQP))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 16)
              .setMIFlag(Flag);
    break;
  }
  case AArch64::LDPQpost:
    Imm = -Imm;
    [[fallthrough]];
  case AArch64::STPQpre: {
    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQPX))
              .addImm(Reg0)
              .addImm(Reg1)
              .addImm(Imm * 16)
              .setMIFlag(Flag);
    break;
  }
  }
  auto I = MBB->insertAfter(MBBI, MIB);
  return I;
}

// Fix up the SEH opcode associated with the save/restore instruction.
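// For example (an illustrative sketch, not actual output): if a 48-byte local
// area was folded into the callee-save SP bump, a save described by
//   .seh_save_regp x19, 16
// is rewritten here to
//   .seh_save_regp x19, 64
// so the unwind info matches the instruction's adjusted SP-relative offset.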
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
                           unsigned LocalStackSize) {
  MachineOperand *ImmOpnd = nullptr;
  unsigned ImmIdx = MBBI->getNumOperands() - 1;
  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Fix the offset in the SEH instruction");
  case AArch64::SEH_SaveFPLR:
  case AArch64::SEH_SaveRegP:
  case AArch64::SEH_SaveReg:
  case AArch64::SEH_SaveFRegP:
  case AArch64::SEH_SaveFReg:
  case AArch64::SEH_SaveAnyRegQP:
  case AArch64::SEH_SaveAnyRegQPX:
    ImmOpnd = &MBBI->getOperand(ImmIdx);
    break;
  }
  if (ImmOpnd)
    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}

bool requiresGetVGCall(MachineFunction &MF) {
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return AFI->hasStreamingModeChanges() &&
         !MF.getSubtarget<AArch64Subtarget>().hasSVE();
}

static bool requiresSaveVG(const MachineFunction &MF) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  // For Darwin platforms we don't save VG for non-SVE functions, even if SME
  // is enabled with streaming mode changes.
  if (!AFI->hasStreamingModeChanges())
    return false;
  auto &ST = MF.getSubtarget<AArch64Subtarget>();
  if (ST.isTargetDarwin())
    return ST.hasSVE();
  return true;
}

bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
  unsigned Opc = MBBI->getOpcode();
  if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
      Opc == AArch64::UBFMXri)
    return true;

  if (requiresGetVGCall(*MBBI->getMF())) {
    if (Opc == AArch64::ORRXrr)
      return true;

    if (Opc == AArch64::BL) {
      auto Op1 = MBBI->getOperand(0);
      return Op1.isSymbol() &&
             (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
    }
  }

  return false;
}
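
// For reference, a sketch of the VG sequences matched above (illustrative
// only; exact instructions vary): with SVE available, VG can be materialized
// by a single "cntd xN"; a locally-streaming function may instead use "rdsvl"
// plus a "ubfx" to convert bytes to 8-byte units; without SVE the value comes
// from a call to __arm_get_current_vg, with ORRs copying it around the call.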

// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
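//
// For example (an illustrative sketch): with a 16-byte callee-save area,
//   stp x29, x30, [sp]    becomes    stp x29, x30, [sp, #-16]!
// in the prologue, and
//   ldp x29, x30, [sp]    becomes    ldp x29, x30, [sp], #16
// in the epilogue, folding the SP adjustment into the save/restore itself.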
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
    bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
    MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
    int CFAOffset = 0) {
  unsigned NewOpc;

  // If the function contains streaming mode changes, we expect instructions
  // to calculate the value of VG before spilling. For locally-streaming
  // functions, we need to do this for both the streaming and non-streaming
  // vector length. Move past these instructions if necessary.
  MachineFunction &MF = *MBB.getParent();
  if (requiresSaveVG(MF))
    while (isVGInstruction(MBBI))
      ++MBBI;

  switch (MBBI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  case AArch64::STPXi:
    NewOpc = AArch64::STPXpre;
    break;
  case AArch64::STPDi:
    NewOpc = AArch64::STPDpre;
    break;
  case AArch64::STPQi:
    NewOpc = AArch64::STPQpre;
    break;
  case AArch64::STRXui:
    NewOpc = AArch64::STRXpre;
    break;
  case AArch64::STRDui:
    NewOpc = AArch64::STRDpre;
    break;
  case AArch64::STRQui:
    NewOpc = AArch64::STRQpre;
    break;
  case AArch64::LDPXi:
    NewOpc = AArch64::LDPXpost;
    break;
  case AArch64::LDPDi:
    NewOpc = AArch64::LDPDpost;
    break;
  case AArch64::LDPQi:
    NewOpc = AArch64::LDPQpost;
    break;
  case AArch64::LDRXui:
    NewOpc = AArch64::LDRXpost;
    break;
  case AArch64::LDRDui:
    NewOpc = AArch64::LDRDpost;
    break;
  case AArch64::LDRQui:
    NewOpc = AArch64::LDRQpost;
    break;
  }
  TypeSize Scale = TypeSize::getFixed(1), Width = TypeSize::getFixed(0);
  int64_t MinOffset, MaxOffset;
  bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
      NewOpc, Scale, Width, MinOffset, MaxOffset);
  (void)Success;
  assert(Success && "unknown load/store opcode");

  // If the first store isn't right where we want SP, then we can't fold the
  // update in, so create a normal arithmetic instruction instead.
  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
      CSStackSizeInc < MinOffset * (int64_t)Scale.getFixedValue() ||
      CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) {
    // If we are destroying the frame, make sure we add the increment after the
    // last frame operation.
    if (FrameFlag == MachineInstr::FrameDestroy) {
      ++MBBI;
      // Also skip the SEH instruction, if needed
      if (NeedsWinCFI && AArch64InstrInfo::isSEHInstruction(*MBBI))
        ++MBBI;
    }
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
                    false, NeedsWinCFI, HasWinCFI, EmitCFI,
                    StackOffset::getFixed(CFAOffset));

    return std::prev(MBBI);
  }

  // Get rid of the SEH code associated with the old instruction.
  if (NeedsWinCFI) {
    auto SEH = std::next(MBBI);
    if (AArch64InstrInfo::isSEHInstruction(*SEH))
      SEH->eraseFromParent();
  }

  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
  MIB.addReg(AArch64::SP, RegState::Define);

  // Copy all operands other than the immediate offset.
  unsigned OpndIdx = 0;
  for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
       ++OpndIdx)
    MIB.add(MBBI->getOperand(OpndIdx));

  assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
         "Unexpected immediate offset in first/last callee-save save/restore "
         "instruction!");
  assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  assert(CSStackSizeInc % Scale == 0);
  MIB.addImm(CSStackSizeInc / (int)Scale);

  MIB.setMIFlags(MBBI->getFlags());
  MIB.setMemRefs(MBBI->memoperands());

  // Generate a new SEH code that corresponds to the new instruction.
  if (NeedsWinCFI) {
    *HasWinCFI = true;
    InsertSEH(*MIB, *TII, FrameFlag);
  }

  if (EmitCFI)
    CFIInstBuilder(MBB, MBBI, FrameFlag)
        .buildDefCFAOffset(CFAOffset - CSStackSizeInc);

  return std::prev(MBB.erase(MBBI));
}

// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
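//
// For example (illustrative numbers): with LocalStackSize == 64, a save
//   stp x19, x20, [sp, #16]    is rewritten to    stp x19, x20, [sp, #80]
// and the matching SEH opcode, if any, has its offset bumped to match.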
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
                                              uint64_t LocalStackSize,
                                              bool NeedsWinCFI,
                                              bool *HasWinCFI) {
  if (AArch64InstrInfo::isSEHInstruction(MI))
    return;

  unsigned Opc = MI.getOpcode();
  unsigned Scale;
  switch (Opc) {
  case AArch64::STPXi:
  case AArch64::STRXui:
  case AArch64::STPDi:
  case AArch64::STRDui:
  case AArch64::LDPXi:
  case AArch64::LDRXui:
  case AArch64::LDPDi:
  case AArch64::LDRDui:
    Scale = 8;
    break;
  case AArch64::STPQi:
  case AArch64::STRQui:
  case AArch64::LDPQi:
  case AArch64::LDRQui:
    Scale = 16;
    break;
  default:
    llvm_unreachable("Unexpected callee-save save/restore opcode!");
  }

  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
         "Unexpected base register in callee-save save/restore instruction!");
  // Last operand is immediate offset that needs fixing.
  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
  // All generated opcodes have scaled offsets.
  assert(LocalStackSize % Scale == 0);
  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);

  if (NeedsWinCFI) {
    *HasWinCFI = true;
    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
           "Expecting a SEH instruction");
    fixupSEHOpcode(MBBI, LocalStackSize);
  }
}

static bool isTargetWindows(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
}

static unsigned getStackHazardSize(const MachineFunction &MF) {
  return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
}

// Convenience function to determine whether I is an SVE callee save.
static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
  switch (I->getOpcode()) {
  default:
    return false;
  case AArch64::PTRUE_C_B:
  case AArch64::LD1B_2Z_IMM:
  case AArch64::ST1B_2Z_IMM:
  case AArch64::STR_ZXI:
  case AArch64::STR_PXI:
  case AArch64::LDR_ZXI:
  case AArch64::LDR_PXI:
  case AArch64::PTRUE_B:
  case AArch64::CPY_ZPzI_B:
  case AArch64::CMPNE_PPzZI_B:
    return I->getFlag(MachineInstr::FrameSetup) ||
           I->getFlag(MachineInstr::FrameDestroy);
  case AArch64::SEH_SavePReg:
  case AArch64::SEH_SaveZReg:
    return true;
  }
}

static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
                                        MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL, bool NeedsWinCFI,
                                        bool NeedsUnwindInfo) {
  // Shadow call stack prolog: str x30, [x18], #8
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
      .addReg(AArch64::X18, RegState::Define)
      .addReg(AArch64::LR)
      .addReg(AArch64::X18)
      .addImm(8)
      .setMIFlag(MachineInstr::FrameSetup);

  // This instruction also makes x18 live-in to the entry block.
  MBB.addLiveIn(AArch64::X18);

  if (NeedsWinCFI)
    BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
        .setMIFlag(MachineInstr::FrameSetup);

  if (NeedsUnwindInfo) {
    // Emit a CFI instruction that causes 8 to be subtracted from the value of
    // x18 when unwinding past this frame.
    static const char CFIInst[] = {
        dwarf::DW_CFA_val_expression,
        18, // register
        2,  // length
        static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
        static_cast<char>(-8) & 0x7f, // addend (sleb128)
    };
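    // In other words, the escape encodes DW_CFA_val_expression(x18,
    // {DW_OP_breg18, sleb128(-8)}): when unwinding, x18's value is recomputed
    // as "current x18 - 8", undoing the post-increment store above (0x78 is
    // -8 encoded as a single sleb128 byte).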
    CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
        .buildEscape(StringRef(CFIInst, sizeof(CFIInst)));
  }
}

static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII,
                                        MachineFunction &MF,
                                        MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        const DebugLoc &DL, bool NeedsWinCFI) {
  // Shadow call stack epilog: ldr x30, [x18, #-8]!
  BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
      .addReg(AArch64::X18, RegState::Define)
      .addReg(AArch64::LR, RegState::Define)
      .addReg(AArch64::X18)
      .addImm(-8)
      .setMIFlag(MachineInstr::FrameDestroy);

  if (NeedsWinCFI)
    BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
        .setMIFlag(MachineInstr::FrameDestroy);

  if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF))
    CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy)
        .buildRestore(AArch64::X18);
}

// Define the current CFA rule to use the provided FP.
static void emitDefineCFAWithFP(MachineFunction &MF, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MBBI,
                                unsigned FixedObject) {
  const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *TRI = STI.getRegisterInfo();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

  const int OffsetToFirstCalleeSaveFromFP =
      AFI->getCalleeSaveBaseToFrameRecordOffset() -
      AFI->getCalleeSavedStackSize();
  Register FramePtr = TRI->getFrameRegister(MF);
  CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
      .buildDefCFA(FramePtr, FixedObject - OffsetToFirstCalleeSaveFromFP);
}
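
// For instance (purely illustrative numbers): with no fixed-object area, a
// 32-byte callee-save area, and the frame record at its base, this emits the
// equivalent of ".cfi_def_cfa w29, 32", i.e. CFA = FP + 32.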

#ifndef NDEBUG
/// Collect live registers from the end of \p MI's parent up to (including) \p
/// MI in \p LiveRegs.
static void getLivePhysRegsUpTo(MachineInstr &MI, const TargetRegisterInfo &TRI,
                                LivePhysRegs &LiveRegs) {

  MachineBasicBlock &MBB = *MI.getParent();
  LiveRegs.addLiveOuts(MBB);
  for (const MachineInstr &MI :
       reverse(make_range(MI.getIterator(), MBB.instr_end())))
    LiveRegs.stepBackward(MI);
}
#endif

void AArch64FrameLowering::emitPacRetPlusLeafHardening(
    MachineFunction &MF) const {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  auto EmitSignRA = [&](MachineBasicBlock &MBB) {
    DebugLoc DL; // Set debug location to unknown.
    MachineBasicBlock::iterator MBBI = MBB.begin();

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
        .setMIFlag(MachineInstr::FrameSetup);
  };

  auto EmitAuthRA = [&](MachineBasicBlock &MBB) {
    DebugLoc DL;
    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_EPILOGUE))
        .setMIFlag(MachineInstr::FrameDestroy);
  };

  // This should be in sync with PEIImpl::calculateSaveRestoreBlocks.
  EmitSignRA(MF.front());
  for (MachineBasicBlock &MBB : MF) {
    if (MBB.isEHFuncletEntry())
      EmitSignRA(MBB);
    if (MBB.isReturnBlock())
      EmitAuthRA(MBB);
  }
}

void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.begin();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const Function &F = MF.getFunction();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
  bool HasFP = hasFP(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  bool HasWinCFI = false;
  auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });

  MachineBasicBlock::iterator End = MBB.end();
#ifndef NDEBUG
  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  // Collect live registers from the end of MBB up to the start of the
  // existing frame setup instructions.
  MachineBasicBlock::iterator NonFrameStart = MBB.begin();
  while (NonFrameStart != End &&
         NonFrameStart->getFlag(MachineInstr::FrameSetup))
    ++NonFrameStart;

  LivePhysRegs LiveRegs(*TRI);
  if (NonFrameStart != MBB.end()) {
    getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs);
    // Ignore registers used for stack management for now.
    LiveRegs.removeReg(AArch64::SP);
    LiveRegs.removeReg(AArch64::X19);
    LiveRegs.removeReg(AArch64::FP);
    LiveRegs.removeReg(AArch64::LR);

    // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
    // This is necessary to spill VG if required where SVE is unavailable, but
    // X0 is preserved around this call.
    if (requiresGetVGCall(MF))
      LiveRegs.removeReg(AArch64::X0);
  }

  auto VerifyClobberOnExit = make_scope_exit([&]() {
    if (NonFrameStart == MBB.end())
      return;
    // Check if any of the newly inserted instructions clobber any of the
    // live registers.
    for (MachineInstr &MI :
         make_range(MBB.instr_begin(), NonFrameStart->getIterator())) {
      for (auto &Op : MI.operands())
        if (Op.isReg() && Op.isDef())
          assert(!LiveRegs.contains(Op.getReg()) &&
                 "live register clobbered by inserted prologue instructions");
    }
  });
#endif

  bool IsFunclet = MBB.isEHFuncletEntry();

  // At this point, we're going to decide whether or not the function uses a
  // redzone. In most cases, the function doesn't have a redzone so let's
  // assume that's false and set it to true in the case that there's a redzone.
  AFI->setHasRedZone(false);

  // Debug location must be unknown since the first debug location is used
  // to determine the end of the prologue.
  DebugLoc DL;

  const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
  if (MFnI.shouldSignReturnAddress(MF)) {
    // If pac-ret+leaf is in effect, PAUTH_PROLOGUE pseudo instructions
    // are inserted by emitPacRetPlusLeafHardening().
    if (!shouldSignReturnAddressEverywhere(MF)) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
          .setMIFlag(MachineInstr::FrameSetup);
    }
    // AArch64PointerAuth pass will insert SEH_PACSignLR
    HasWinCFI |= NeedsWinCFI;
  }

  if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) {
    emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
                                MFnI.needsDwarfUnwindInfo(MF));
    HasWinCFI |= NeedsWinCFI;
  }

  if (EmitCFI && MFnI.isMTETagged()) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // We signal the presence of a Swift extended frame to external tools by
  // storing FP with 0b0001 in bits 63:60. In normal userland operation a
  // simple ORR is sufficient; it is assumed a Swift kernel would initialize
  // the TBI bits so that this remains true.
  if (HasFP && AFI->hasSwiftAsyncContext()) {
    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
    case SwiftAsyncFramePointerMode::DeploymentBased:
      if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
        // The special symbol below is absolute and has a *value* that can be
        // combined with the frame pointer to signal an extended frame.
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
            .addExternalSymbol("swift_async_extendedFramePointerFlags",
                               AArch64II::MO_GOT);
        if (NeedsWinCFI) {
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
              .setMIFlags(MachineInstr::FrameSetup);
          HasWinCFI = true;
        }
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
            .addUse(AArch64::FP)
            .addUse(AArch64::X16)
            .addImm(Subtarget.isTargetILP32() ? 32 : 0);
        if (NeedsWinCFI) {
          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
              .setMIFlags(MachineInstr::FrameSetup);
          HasWinCFI = true;
        }
        break;
      }
      [[fallthrough]];

    case SwiftAsyncFramePointerMode::Always:
      // ORR x29, x29, #0x1000_0000_0000_0000
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
          .addUse(AArch64::FP)
          .addImm(0x1100)
          .setMIFlag(MachineInstr::FrameSetup);
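      // Note: 0x1100 is the (N:immr:imms) encoding of the logical immediate
      // 0x1000000000000000, i.e. only bit 60 set.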
      if (NeedsWinCFI) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlags(MachineInstr::FrameSetup);
        HasWinCFI = true;
      }
      break;

    case SwiftAsyncFramePointerMode::Never:
      break;
    }
  }

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // Set tagged base pointer to the requested stack slot.
  // Ideally it should match SP value after prologue.
  std::optional<int> TBPI = AFI->getTaggedBasePointerIndex();
  if (TBPI)
    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
  else
    AFI->setTaggedBasePointerOffset(MFI.getStackSize());

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // getStackSize() includes all the locals in its size calculation. We don't
  // include these locals when computing the stack size of a funclet, as they
  // are allocated in the parent's stack frame and accessed via the frame
  // pointer from the funclet. We only save the callee saved registers in the
  // funclet, which are really the callee saved registers of the parent
  // function, including the funclet.
  int64_t NumBytes =
      IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
  if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
    assert(!HasFP && "unexpected function without stack frame but with FP");
    assert(!SVEStackSize &&
           "unexpected function without stack frame but with SVE objects");
    // All of the stack allocation is for locals.
    AFI->setLocalStackSize(NumBytes);
    if (!NumBytes) {
      if (NeedsWinCFI && HasWinCFI) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      return;
    }
    // REDZONE: If the stack size is less than 128 bytes, we don't need
    // to actually allocate.
    if (canUseRedZone(MF)) {
      AFI->setHasRedZone(true);
      ++NumRedZoneFunctions;
    } else {
      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-NumBytes), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
      if (EmitCFI) {
        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
        MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
        // Encode the stack size of the leaf function.
        CFIInstBuilder(MBB, MBBI, MachineInstr::FrameSetup)
            .buildDefCFAOffset(NumBytes, FrameLabel);
      }
    }

    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
          .setMIFlag(MachineInstr::FrameSetup);
    }

    return;
  }

  bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  // Windows unwind can't represent the required stack adjustments if we have
  // both SVE callee-saves and dynamic stack allocations, and the frame
  // pointer is before the SVE spills. The allocation of the frame pointer
  // must be the last instruction in the prologue so the unwinder can restore
  // the stack pointer correctly. (And there isn't any unwind opcode for
  // `addvl sp, x29, -17`.)
  //
  // Because of this, we do spills in the opposite order on Windows: first SVE,
  // then GPRs. The main side-effect of this is that it makes accessing
  // parameters passed on the stack more expensive.
  //
  // We could consider rearranging the spills for simpler cases.
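  //
  // A simplified sketch of the resulting Windows prologue shape (illustrative
  // only, not exact output):
  //   sub   sp, sp, #16            // fixed-object area
  //   addvl sp, sp, #-2            // SVE callee-save area (allocated with it)
  //   str   z8, [sp]               // ...SVE callee saves
  //   stp   x29, x30, [sp, #-32]!  // GPR callee saves / frame record
  //   mov   x29, sp                // FP established last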
  bool FPAfterSVECalleeSaves =
      Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();

  if (FPAfterSVECalleeSaves && AFI->hasStackHazardSlotIndex())
    reportFatalUsageError("SME hazard padding is not supported on Windows");

  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // All of the remaining stack allocations are for locals.
  AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
  if (FPAfterSVECalleeSaves) {
    // If we're doing SVE saves first, we need to immediately allocate space
    // for fixed objects, then space for the SVE callee saves.
    //
    // Windows unwind requires that the scalable size is a multiple of 16;
    // that's handled when the callee-saved size is computed.
    auto SaveSize =
        StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()) +
        StackOffset::getFixed(FixedObject);
    allocateStackSpace(MBB, MBBI, 0, SaveSize, NeedsWinCFI, &HasWinCFI,
                       /*EmitCFI=*/false, StackOffset{},
                       /*FollowupAllocs=*/true);
    NumBytes -= FixedObject;

    // Now allocate space for the GPR callee saves.
    while (MBBI != End && IsSVECalleeSave(MBBI))
      ++MBBI;
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -AFI->getCalleeSavedStackSize(), NeedsWinCFI,
        &HasWinCFI, EmitAsyncCFI);
    NumBytes -= AFI->getCalleeSavedStackSize();
  } else if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-NumBytes), TII,
                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
                    EmitAsyncCFI);
    NumBytes = 0;
  } else if (HomPrologEpilog) {
    // Stack has been already adjusted.
    NumBytes -= PrologueSaveSize;
  } else if (PrologueSaveSize != 0) {
    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
        EmitAsyncCFI);
    NumBytes -= PrologueSaveSize;
  }
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Move past the saves of the callee-saved registers, fixing up the offsets
  // and pre-inc if we decided to combine the callee-save and local stack
  // pointer bump above.
  while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
         !IsSVECalleeSave(MBBI)) {
    if (CombineSPBump &&
        // Only fix-up frame-setup load/store instructions.
        (!requiresSaveVG(MF) || !isVGInstruction(MBBI)))
      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
    ++MBBI;
  }

  // For funclets the FP belongs to the containing function.
  if (!IsFunclet && HasFP) {
    // Only set up FP if we actually need to.
    int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();

    if (CombineSPBump)
      FPOffset += AFI->getLocalStackSize();

    if (AFI->hasSwiftAsyncContext()) {
      // Before we update the live FP we have to ensure there's a valid (or
      // null) asynchronous context in its slot just before FP in the frame
      // record, so store it now.
      const auto &Attrs = MF.getFunction().getAttributes();
      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
      if (HaveInitialContext)
        MBB.addLiveIn(AArch64::X22);
      Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
          .addUse(Reg)
          .addUse(AArch64::SP)
          .addImm(FPOffset - 8)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
        // to multiple instructions, should be mutually-exclusive.
        assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlags(MachineInstr::FrameSetup);
        HasWinCFI = true;
      }
    }

    if (HomPrologEpilog) {
      auto Prolog = MBBI;
      --Prolog;
      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
    } else {
      // Issue sub fp, sp, FPOffset or
      // mov fp,sp when FPOffset is zero.
      // Note: All stores of callee-saved registers are marked as "FrameSetup".
      // This code marks the instruction(s) that set the FP also.
      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
                      StackOffset::getFixed(FPOffset), TII,
                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
      if (NeedsWinCFI && HasWinCFI) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
            .setMIFlag(MachineInstr::FrameSetup);
        // After setting up the FP, the rest of the prolog doesn't need to be
        // included in the SEH unwind info.
        NeedsWinCFI = false;
      }
    }
    if (EmitAsyncCFI)
      emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
  }

  // Now emit the moves for whatever callee saved regs we have (including FP,
  // LR if those are saved). Frame instructions for SVE registers are emitted
  // later, after the instructions which actually save the SVE regs.
  if (EmitAsyncCFI)
    emitCalleeSavedGPRLocations(MBB, MBBI);

  // Alignment is required for the parent frame, not the funclet
  const bool NeedsRealignment =
      NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
  const int64_t RealignmentPadding =
      (NeedsRealignment && MFI.getMaxAlign() > Align(16))
          ? MFI.getMaxAlign().value() - 16
          : 0;

  if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
    if (AFI->getSVECalleeSavedStackSize())
      report_fatal_error(
          "SVE callee saves not yet supported with stack probing");

    // Find an available register to spill the value of X15 to, if X15 is being
    // used already for nest.
    unsigned X15Scratch = AArch64::NoRegister;
    const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
    if (llvm::any_of(MBB.liveins(),
                     [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
                       return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
                           AArch64::X15, LiveIn.PhysReg);
                     })) {
      X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
      assert(X15Scratch != AArch64::NoRegister &&
             (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
#ifndef NDEBUG
      LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
#endif
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
          .addReg(AArch64::XZR)
          .addReg(AArch64::X15, RegState::Undef)
          .addReg(AArch64::X15, RegState::Implicit)
          .setMIFlag(MachineInstr::FrameSetup);
    }

    uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
    if (NeedsWinCFI) {
      HasWinCFI = true;
      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
      // exceed this amount. We need to move at most 2^24 - 1 into x15.
      // This is at most two instructions, MOVZ followed by MOVK.
      // TODO: Fix to use multiple stack alloc unwind codes for stacks
      // exceeding 256MB in size.
      if (NumBytes >= (1 << 28))
        report_fatal_error("Stack size cannot exceed 256MB for stack "
                           "unwinding purposes");

      uint32_t LowNumWords = NumWords & 0xFFFF;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
          .addImm(LowNumWords)
          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
          .setMIFlag(MachineInstr::FrameSetup);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
      if ((NumWords & 0xFFFF0000) != 0) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
            .addReg(AArch64::X15)
            .addImm((NumWords & 0xFFFF0000) >> 16) // High half
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
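      // E.g. (illustration): NumWords == 0x12345 is materialized as
      //   movz x15, #0x2345
      //   movk x15, #0x1, lsl #16
      // with an SEH nop recorded after each instruction.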
    } else {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
          .addImm(NumWords)
          .setMIFlags(MachineInstr::FrameSetup);
    }

    const char *ChkStk = Subtarget.getChkStkName();
    switch (MF.getTarget().getCodeModel()) {
    case CodeModel::Tiny:
    case CodeModel::Small:
    case CodeModel::Medium:
    case CodeModel::Kernel:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
          .addExternalSymbol(ChkStk)
          .addReg(AArch64::X15, RegState::Implicit)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    case CodeModel::Large:
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
          .addReg(AArch64::X16, RegState::Define)
          .addExternalSymbol(ChkStk)
          .addExternalSymbol(ChkStk)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }

      BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
          .addReg(AArch64::X16, RegState::Kill)
          .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
          .addReg(AArch64::X16,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::X17,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .addReg(AArch64::NZCV,
                  RegState::Implicit | RegState::Define | RegState::Dead)
          .setMIFlags(MachineInstr::FrameSetup);
      if (NeedsWinCFI) {
        HasWinCFI = true;
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlag(MachineInstr::FrameSetup);
      }
      break;
    }

    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
        .addReg(AArch64::SP, RegState::Kill)
        .addReg(AArch64::X15, RegState::Kill)
        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
        .setMIFlags(MachineInstr::FrameSetup);
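    // This is "sub sp, sp, x15, uxtx #4": the shift scales X15 (the size in
    // 16-byte units left there by the probe helper) back to bytes.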
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
          .addImm(NumBytes)
          .setMIFlag(MachineInstr::FrameSetup);
    }
    NumBytes = 0;

    if (RealignmentPadding > 0) {
      if (RealignmentPadding >= 4096) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
            .addReg(AArch64::X16, RegState::Define)
            .addImm(RealignmentPadding)
            .setMIFlags(MachineInstr::FrameSetup);
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
            .addReg(AArch64::SP)
            .addReg(AArch64::X16, RegState::Kill)
            .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
            .setMIFlag(MachineInstr::FrameSetup);
      } else {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
            .addReg(AArch64::SP)
            .addImm(RealignmentPadding)
            .addImm(0)
            .setMIFlag(MachineInstr::FrameSetup);
      }

      uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
          .addReg(AArch64::X15, RegState::Kill)
          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
      AFI->setStackRealigned(true);
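      // E.g. (illustrative): for MFI.getMaxAlign() == 64 this is
      //   add x15, sp, #48
      //   and sp, x15, #0xffffffffffffffc0
      // which rounds SP down to a 64-byte boundary.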

      // No need for SEH instructions here; if we're realigning the stack,
      // we've set a frame pointer and already finished the SEH prologue.
      assert(!NeedsWinCFI);
    }
    if (X15Scratch != AArch64::NoRegister) {
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
          .addReg(AArch64::XZR)
          .addReg(X15Scratch, RegState::Undef)
          .addReg(X15Scratch, RegState::Implicit)
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
  MachineBasicBlock::iterator CalleeSavesEnd = MBBI;

  StackOffset CFAOffset =
      StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);

  // Process the SVE callee-saves to determine what space needs to be
  // allocated.
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
                      << "\n");
    SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
    SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
    // Find callee save instructions in frame.
    // Note: With FPAfterSVECalleeSaves the callee saves have already been
    // allocated.
    if (!FPAfterSVECalleeSaves) {
      MachineBasicBlock::iterator CalleeSavesBegin = MBBI;
      assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
      while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
        ++MBBI;
      CalleeSavesEnd = MBBI;

      StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
      // Allocate space for the callee saves (if any).
      allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
                         nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
                         MFI.hasVarSizedObjects() || LocalsSize);
    }
  }
  CFAOffset += SVECalleeSavesSize;

  if (EmitAsyncCFI)
    emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);

  // Allocate space for the rest of the frame including SVE locals. Align the
  // stack as necessary.
  assert(!(canUseRedZone(MF) && NeedsRealignment) &&
         "Cannot use redzone with stack realignment");
  if (!canUseRedZone(MF)) {
    // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
    // the correct value here, as NumBytes also includes padding bytes,
    // which shouldn't be counted here.
    allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
                       SVELocalsSize + StackOffset::getFixed(NumBytes),
                       NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
                       CFAOffset, MFI.hasVarSizedObjects());
  }

  // If we need a base pointer, set it up here. It's whatever the value of the
  // stack pointer is at this point. Any variable size objects will be allocated
  // after this, so we can still use the base pointer to reference locals.
  //
  // FIXME: Clarify FrameSetup flags here.
  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
  // needed.
  // For funclets the BP belongs to the containing function.
  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
    TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                     false);
    if (NeedsWinCFI) {
      HasWinCFI = true;
      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
          .setMIFlag(MachineInstr::FrameSetup);
    }
  }

  // The very last FrameSetup instruction indicates the end of prologue. Emit a
  // SEH opcode indicating the prologue end.
  if (NeedsWinCFI && HasWinCFI) {
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // SEH funclets are passed the frame pointer in X1. If the parent
  // function uses the base register, then the base register is used
  // directly, and is not retrieved from X1.
  if (IsFunclet && F.hasPersonalityFn()) {
    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
    if (isAsynchronousEHPersonality(Per)) {
      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
          .addReg(AArch64::X1)
          .setMIFlag(MachineInstr::FrameSetup);
      MBB.addLiveIn(AArch64::X1);
    }
  }

  if (EmitCFI && !EmitAsyncCFI) {
    if (HasFP) {
      emitDefineCFAWithFP(MF, MBB, MBBI, FixedObject);
    } else {
      StackOffset TotalSize =
          SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
      CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
      CFIBuilder.insertCFIInst(
          createDefCFA(*RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP,
                       TotalSize, /*LastAdjustmentWasScalable=*/false));
    }
    emitCalleeSavedGPRLocations(MBB, MBBI);
    emitCalleeSavedSVELocations(MBB, MBBI);
  }
}

static bool isFuncletReturnInstr(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::CATCHRET:
  case AArch64::CLEANUPRET:
    return true;
  }
}

void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                                        MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL;
  bool NeedsWinCFI = needsWinCFI(MF);
  bool EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
  bool HasWinCFI = false;
  bool IsFunclet = false;

  if (MBB.end() != MBBI) {
    DL = MBBI->getDebugLoc();
    IsFunclet = isFuncletReturnInstr(*MBBI);
  }

  MachineBasicBlock::iterator EpilogStartI = MBB.end();

  auto FinishingTouches = make_scope_exit([&]() {
    if (AFI->needsShadowCallStackPrologueEpilogue(MF)) {
      emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL,
                                  NeedsWinCFI);
      HasWinCFI |= NeedsWinCFI;
    }
    if (EmitCFI)
      emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
    if (AFI->shouldSignReturnAddress(MF)) {
      // If pac-ret+leaf is in effect, PAUTH_EPILOGUE pseudo instructions
      // are inserted by emitPacRetPlusLeafHardening().
      if (!shouldSignReturnAddressEverywhere(MF)) {
        BuildMI(MBB, MBB.getFirstTerminator(), DL,
                TII->get(AArch64::PAUTH_EPILOGUE))
            .setMIFlag(MachineInstr::FrameDestroy);
      }
      // AArch64PointerAuth pass will insert SEH_PACSignLR
      HasWinCFI |= NeedsWinCFI;
    }
    if (HasWinCFI) {
      BuildMI(MBB, MBB.getFirstTerminator(), DL,
              TII->get(AArch64::SEH_EpilogEnd))
          .setMIFlag(MachineInstr::FrameDestroy);
      if (!MF.hasWinCFI())
        MF.setHasWinCFI(true);
    }
    if (NeedsWinCFI) {
      assert(EpilogStartI != MBB.end());
      if (!HasWinCFI)
        MBB.erase(EpilogStartI);
    }
  });

  int64_t NumBytes =
      IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();

  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  // How much of the stack used by incoming arguments this function is expected
  // to restore in this particular epilogue.
  int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
  bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
                                              MF.getFunction().isVarArg());
  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

  int64_t AfterCSRPopSize = ArgumentStackToRestore;
  auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
  // We cannot rely on the local stack size set in emitPrologue if the function
  // has funclets, as funclets have different local stack size requirements, and
  // the current value set in emitPrologue may be that of the containing
  // function.
  if (MF.hasEHFunclets())
    AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
  if (homogeneousPrologEpilog(MF, &MBB)) {
    assert(!NeedsWinCFI);
    auto LastPopI = MBB.getFirstTerminator();
    if (LastPopI != MBB.begin()) {
      auto HomogeneousEpilog = std::prev(LastPopI);
      if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
        LastPopI = HomogeneousEpilog;
    }

    // Adjust local stack
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(AFI->getLocalStackSize()), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);

    // SP has already been adjusted while restoring the callee-save regs.
    // We've already bailed out of the case that adjusts SP for arguments.
    assert(AfterCSRPopSize == 0);
    return;
  }

  bool FPAfterSVECalleeSaves =
      Subtarget.isTargetWindows() && AFI->getSVECalleeSavedStackSize();

  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
  // Assume we can't combine the last pop with the sp restore.
  bool CombineAfterCSRBump = false;
  if (FPAfterSVECalleeSaves) {
    AfterCSRPopSize += FixedObject;
  } else if (!CombineSPBump && PrologueSaveSize != 0) {
    MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
    while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
           AArch64InstrInfo::isSEHInstruction(*Pop))
      Pop = std::prev(Pop);
    // Converting the last ldp to a post-index ldp is valid only if the last
    // ldp's offset is 0.
    const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
    // If the offset is 0 and the AfterCSR pop is not actually trying to
    // allocate more stack for arguments (in space that an untimely interrupt
    // may clobber), convert it to a post-index ldp.
    if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
      convertCalleeSaveRestoreToSPPrePostIncDec(
          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
          MachineInstr::FrameDestroy, PrologueSaveSize);
    } else {
      // If not, make sure to emit an add after the last ldp.
      // We're doing this by transferring the size to be restored from the
      // adjustment *before* the CSR pops to the adjustment *after* the CSR
      // pops.
      AfterCSRPopSize += PrologueSaveSize;
      CombineAfterCSRBump = true;
    }
  }
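  // For example (an illustrative sketch): with PrologueSaveSize == 32, the
  // final restore
  //   ldp x29, x30, [sp]    becomes    ldp x29, x30, [sp], #32
  // so the pop also deallocates the callee-save area.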

  // Move past the restores of the callee-saved registers.
  // If we plan on combining the sp bump of the local stack size and the callee
  // save stack size, we might need to adjust the CSR save and restore offsets.
  MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
  MachineBasicBlock::iterator Begin = MBB.begin();
  while (LastPopI != Begin) {
    --LastPopI;
    if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
        (!FPAfterSVECalleeSaves && IsSVECalleeSave(LastPopI))) {
      ++LastPopI;
      break;
    } else if (CombineSPBump)
      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
                                        NeedsWinCFI, &HasWinCFI);
  }

  if (NeedsWinCFI) {
    // Note that there are cases where we insert SEH opcodes in the
    // epilogue when we had no SEH opcodes in the prologue. For
    // example, when there is no stack frame but there are stack
    // arguments. Insert the SEH_EpilogStart and remove it later if we
    // didn't emit any SEH opcodes, to avoid generating WinCFI for
    // functions that don't need it.
    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
        .setMIFlag(MachineInstr::FrameDestroy);
    EpilogStartI = LastPopI;
    --EpilogStartI;
  }

  if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
    switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
    case SwiftAsyncFramePointerMode::DeploymentBased:
      // Avoid the reload as it is GOT relative, and instead fall back to the
      // hardcoded value below. This allows a mismatch between the OS and
      // application without immediately terminating on the difference.
      [[fallthrough]];
    case SwiftAsyncFramePointerMode::Always:
      // We need to reset FP to its untagged state on return. Bit 60 is
      // currently used to show the presence of an extended frame.

      // BIC x29, x29, #0x1000_0000_0000_0000
      BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
              AArch64::FP)
          .addUse(AArch64::FP)
          .addImm(0x10fe)
          .setMIFlag(MachineInstr::FrameDestroy);
      if (NeedsWinCFI) {
        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
            .setMIFlags(MachineInstr::FrameDestroy);
        HasWinCFI = true;
      }
      break;

    case SwiftAsyncFramePointerMode::Never:
      break;
    }
  }

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // If there is a single SP update, insert it before the ret and we're done.
  if (CombineSPBump) {
    assert(!SVEStackSize && "Cannot combine SP bump with SVE");

    // When we are about to restore the CSRs, the CFA register is SP again.
    if (EmitCFI && hasFP(MF))
      CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
          .buildDefCFA(AArch64::SP, NumBytes);

    emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(NumBytes + AfterCSRPopSize), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI,
                    EmitCFI, StackOffset::getFixed(NumBytes));
    return;
  }

  NumBytes -= PrologueSaveSize;
  assert(NumBytes >= 0 && "Negative stack allocation size!?");

  // Process the SVE callee-saves to determine what space needs to be
  // deallocated.
  StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
  MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
  if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
    if (FPAfterSVECalleeSaves)
      RestoreEnd = MBB.getFirstTerminator();

    RestoreBegin = std::prev(RestoreEnd);
    while (RestoreBegin != MBB.begin() &&
           IsSVECalleeSave(std::prev(RestoreBegin)))
      --RestoreBegin;

    assert(IsSVECalleeSave(RestoreBegin) &&
           IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");

    StackOffset CalleeSavedSizeAsOffset =
        StackOffset::getScalable(CalleeSavedSize);
    DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
    DeallocateAfter = CalleeSavedSizeAsOffset;
  }

  // Deallocate the SVE area.
  if (FPAfterSVECalleeSaves) {
    // If the callee-save area is before FP, restoring the FP implicitly
    // deallocates non-callee-save SVE allocations. Otherwise, deallocate
    // them explicitly.
    if (!AFI->isStackRealigned() && !MFI.hasVarSizedObjects()) {
      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                      DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
                      NeedsWinCFI, &HasWinCFI);
    }

    // Deallocate callee-save non-SVE registers.
    emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);

    // Deallocate fixed objects.
    emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(FixedObject), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);

    // Deallocate callee-save SVE registers.
    emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
                    DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
                    NeedsWinCFI, &HasWinCFI);
  } else if (SVEStackSize) {
    int64_t SVECalleeSavedSize = AFI->getSVECalleeSavedStackSize();
    // If we have stack realignment or variable-sized objects we must use the
    // FP to restore SVE callee saves (as there is an unknown amount of
    // data/padding between the SP and SVE CS area).
    Register BaseForSVEDealloc =
        (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
                                                              : AArch64::SP;
    if (SVECalleeSavedSize && BaseForSVEDealloc == AArch64::FP) {
      Register CalleeSaveBase = AArch64::FP;
      if (int64_t CalleeSaveBaseOffset =
              AFI->getCalleeSaveBaseToFrameRecordOffset()) {
        // If we have a non-zero offset to the non-SVE CS base, we need to
        // compute the base address by subtracting the offset in a temporary
        // register first (to avoid briefly deallocating the SVE CS).
        CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
            &AArch64::GPR64RegClass);
        emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
                        StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
                        MachineInstr::FrameDestroy);
      }
      // The code below will deallocate the stack space by moving the
      // SP to the start of the SVE callee-save area.
      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
                      StackOffset::getScalable(-SVECalleeSavedSize), TII,
                      MachineInstr::FrameDestroy);
    } else if (BaseForSVEDealloc == AArch64::SP) {
      if (SVECalleeSavedSize) {
        // Deallocate the non-SVE locals first before we can deallocate (and
        // restore callee saves) from the SVE area.
        emitFrameOffset(
            MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
            StackOffset::getFixed(NumBytes), TII, MachineInstr::FrameDestroy,
            false, NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF),
            SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
        NumBytes = 0;
      }

      emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
                      DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
                      NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF),
                      SVEStackSize +
                          StackOffset::getFixed(NumBytes + PrologueSaveSize));

      emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
                      DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
                      NeedsWinCFI, &HasWinCFI, EmitCFI && !hasFP(MF),
                      DeallocateAfter +
                          StackOffset::getFixed(NumBytes + PrologueSaveSize));
    }
    if (EmitCFI)
      emitCalleeSavedSVERestores(MBB, RestoreEnd);
  }

  if (!hasFP(MF)) {
    bool RedZone = canUseRedZone(MF);
    // If this was a redzone leaf function, we don't need to restore the
    // stack pointer (but we may need to pop stack args for fastcc).
    if (RedZone && AfterCSRPopSize == 0)
      return;

    // Pop the local variables off the stack. If there are no callee-saved
    // registers, it means we are actually positioned at the terminator and can
    // combine stack increment for the locals and the stack increment for
    // callee-popped arguments into (possibly) a single instruction and be done.
    bool NoCalleeSaveRestore = PrologueSaveSize == 0;
    int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
    if (NoCalleeSaveRestore)
      StackRestoreBytes += AfterCSRPopSize;

    emitFrameOffset(
        MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
        StackOffset::getFixed(StackRestoreBytes), TII,
        MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
        StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));

    // If we were able to combine the local stack pop with the argument pop,
    // then we're done.
    if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
      return;
    }

    NumBytes = 0;
  }

  // Restore the original stack pointer.
  // FIXME: Rather than doing the math here, we should instead just use
  // non-post-indexed loads for the restores if we aren't actually going to
  // be able to save any instructions.
  if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
    emitFrameOffset(
        MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
        StackOffset::getFixed(-AFI->getCalleeSaveBaseToFrameRecordOffset()),
        TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
  } else if (NumBytes)
    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(NumBytes), TII,
                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);

  // When we are about to restore the CSRs, the CFA register is SP again.
  if (EmitCFI && hasFP(MF))
    CFIInstBuilder(MBB, LastPopI, MachineInstr::FrameDestroy)
        .buildDefCFA(AArch64::SP, PrologueSaveSize);

  // This must be placed after the callee-save restore code because that code
  // assumes the SP is at the same location as it was after the callee-save save
  // code in the prologue.
  if (AfterCSRPopSize) {
    assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
                                  "interrupt may have clobbered");

    emitFrameOffset(
        MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
        StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
        false, NeedsWinCFI, &HasWinCFI, EmitCFI,
        StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
  }
}

enableCFIFixup(const MachineFunction & MF) const2826 bool AArch64FrameLowering::enableCFIFixup(const MachineFunction &MF) const {
2827 return TargetFrameLowering::enableCFIFixup(MF) &&
2828 MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF);
2829 }
2830
enableFullCFIFixup(const MachineFunction & MF) const2831 bool AArch64FrameLowering::enableFullCFIFixup(const MachineFunction &MF) const {
2832 return enableCFIFixup(MF) &&
2833 MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF);
2834 }
2835
2836 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2837 /// debug info. It's the same as what we use for resolving the code-gen
2838 /// references for now. FIXME: This can go wrong when references are
2839 /// SP-relative and simple call frames aren't used.
2840 StackOffset
getFrameIndexReference(const MachineFunction & MF,int FI,Register & FrameReg) const2841 AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
2842 Register &FrameReg) const {
2843 return resolveFrameIndexReference(
2844 MF, FI, FrameReg,
2845 /*PreferFP=*/
2846 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) ||
2847 MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag),
2848 /*ForSimm=*/false);
2849 }
2850
2851 StackOffset
getFrameIndexReferenceFromSP(const MachineFunction & MF,int FI) const2852 AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
2853 int FI) const {
2854 // This function serves to provide a comparable offset from a single reference
2855 // point (the value of SP at function entry) that can be used for analysis,
2856 // e.g. the stack-frame-layout analysis pass. It is not guaranteed to be
2857 // correct for all objects in the presence of VLA-area objects or dynamic
2858 // stack re-alignment.
2859
2860 const auto &MFI = MF.getFrameInfo();
2861
2862 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2863 StackOffset SVEStackSize = getSVEStackSize(MF);
2864
2865 // For VLA-area objects, just emit an offset at the end of the stack frame.
2866 // Whilst not quite correct, these objects do live at the end of the frame and
2867 // so it is more useful for analysis for the offset to reflect this.
2868 if (MFI.isVariableSizedObjectIndex(FI)) {
2869 return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize;
2870 }
2871
2872 // This is correct in the absence of any SVE stack objects.
2873 if (!SVEStackSize)
2874 return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
2875
2876 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2877 bool FPAfterSVECalleeSaves =
2878 isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
2879 if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
2880 if (FPAfterSVECalleeSaves &&
2881 -ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize())
2882 return StackOffset::getScalable(ObjectOffset);
2883 return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()),
2884 ObjectOffset);
2885 }
2886
2887 bool IsFixed = MFI.isFixedObjectIndex(FI);
2888 bool IsCSR =
2889 !IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2890
2891 StackOffset ScalableOffset = {};
2892 if (!IsFixed && !IsCSR) {
2893 ScalableOffset = -SVEStackSize;
2894 } else if (FPAfterSVECalleeSaves && IsCSR) {
2895 ScalableOffset =
2896 -StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
2897 }
2898
2899 return StackOffset::getFixed(ObjectOffset) + ScalableOffset;
2900 }
2901
2902 StackOffset
getNonLocalFrameIndexReference(const MachineFunction & MF,int FI) const2903 AArch64FrameLowering::getNonLocalFrameIndexReference(const MachineFunction &MF,
2904 int FI) const {
2905 return StackOffset::getFixed(getSEHFrameIndexOffset(MF, FI));
2906 }
2907
getFPOffset(const MachineFunction & MF,int64_t ObjectOffset)2908 static StackOffset getFPOffset(const MachineFunction &MF,
2909 int64_t ObjectOffset) {
2910 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2911 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2912 const Function &F = MF.getFunction();
2913 bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
2914 unsigned FixedObject =
2915 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2916 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
2917 int64_t FPAdjust =
2918 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
2919 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
2920 }
2921
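// getStackOffset returns the post-prologue SP-relative offset of an object:
// the object lives at ObjectOffset relative to the incoming SP, and the
// prologue drops SP by getStackSize() bytes. E.g. (illustrative numbers) an
// object at MFI offset -24 in a 96-byte frame ends up at [sp, #72].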
static StackOffset getStackOffset(const MachineFunction &MF,
                                  int64_t ObjectOffset) {
  const auto &MFI = MF.getFrameInfo();
  return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
}

// TODO: This function currently does not work for scalable vectors.
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
                                                 int FI) const {
  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
  return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
             ? getFPOffset(MF, ObjectOffset).getFixed()
             : getStackOffset(MF, ObjectOffset).getFixed();
}

StackOffset AArch64FrameLowering::resolveFrameIndexReference(
    const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
    bool ForSimm) const {
  const auto &MFI = MF.getFrameInfo();
  int64_t ObjectOffset = MFI.getObjectOffset(FI);
  bool isFixed = MFI.isFixedObjectIndex(FI);
  bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
  return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
                                     PreferFP, ForSimm);
}

StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
    const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
    Register &FrameReg, bool PreferFP, bool ForSimm) const {
  const auto &MFI = MF.getFrameInfo();
  const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();

  int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
  int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
  bool isCSR =
      !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));

  const StackOffset &SVEStackSize = getSVEStackSize(MF);

  // Use frame pointer to reference fixed objects. Use it for locals if
  // there are VLAs or a dynamically realigned SP (and thus the SP isn't
  // reliable as a base). Make sure useFPForScavengingIndex() does the
  // right thing for the emergency spill slot.
  bool UseFP = false;
  if (AFI->hasStackFrame() && !isSVE) {
    // We shouldn't prefer using the FP to access fixed-sized stack objects when
    // there are scalable (SVE) objects in between the FP and the fixed-sized
    // objects.
    PreferFP &= !SVEStackSize;

    // Note: Keeping the following as multiple 'if' statements rather than
    // merging to a single expression for readability.
    //
    // Argument access should always use the FP.
    if (isFixed) {
      UseFP = hasFP(MF);
    } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
      // References to the CSR area must use FP if we're re-aligning the stack
      // since the dynamically-sized alignment padding is between the SP/BP and
      // the CSR area.
      assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
      UseFP = true;
    } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
      // If the FPOffset is negative and we're producing a signed immediate, we
      // have to keep in mind that the available offset range for negative
      // offsets is smaller than for positive ones. If an offset is available
      // via the FP and the SP, use whichever is closest.
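      // (For reference: the unscaled LDUR/STUR forms encode a signed 9-bit
      // byte offset, i.e. [-256, 255], which is where the -256 bound below
      // comes from.)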
      bool FPOffsetFits = !ForSimm || FPOffset >= -256;
      PreferFP |= Offset > -FPOffset && !SVEStackSize;

      if (FPOffset >= 0) {
        // If the FPOffset is positive, that'll always be best, as the SP/BP
        // will be even further away.
        UseFP = true;
      } else if (MFI.hasVarSizedObjects()) {
        // If we have variable sized objects, we can use either FP or BP, as the
        // SP offset is unknown. We can use the base pointer if we have one and
        // FP is not preferred. If not, we're stuck with using FP.
        bool CanUseBP = RegInfo->hasBasePointer(MF);
        if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
          UseFP = PreferFP;
        else if (!CanUseBP) // Can't use BP. Forced to use FP.
          UseFP = true;
        // else we can use BP and FP, but the offset from FP won't fit.
        // That will make us scavenge registers which we can probably avoid by
        // using BP. If it won't fit for BP either, we'll scavenge anyway.
      } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
        // Funclets access the locals contained in the parent's stack frame
        // via the frame pointer, so we have to use the FP in the parent
        // function.
        (void)Subtarget;
        assert(Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
                                            MF.getFunction().isVarArg()) &&
               "Funclets should only be present on Win64");
        UseFP = true;
      } else {
        // We have the choice between FP and (SP or BP).
        if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
          UseFP = true;
      }
    }
  }

  assert(
      ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
      "In the presence of dynamic stack pointer realignment, "
      "non-argument/CSR objects cannot be accessed through the frame pointer");

  bool FPAfterSVECalleeSaves =
      isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();

  if (isSVE) {
    StackOffset FPOffset =
        StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(),
                         ObjectOffset);
    StackOffset SPOffset =
        SVEStackSize +
        StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
                         ObjectOffset);
    if (FPAfterSVECalleeSaves) {
      FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
      if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) {
        FPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize());
        SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize());
      }
    }
    // Always use the FP for SVE spills if available and beneficial.
    if (hasFP(MF) && (SPOffset.getFixed() ||
                      FPOffset.getScalable() < SPOffset.getScalable() ||
                      RegInfo->hasStackRealignment(MF))) {
      FrameReg = RegInfo->getFrameRegister(MF);
      return FPOffset;
    }

    FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
                                           : (unsigned)AArch64::SP;
    return SPOffset;
  }

  StackOffset ScalableOffset = {};
  if (FPAfterSVECalleeSaves) {
    // In this stack layout, the FP is in between the callee saves and other
    // SVE allocations.
    StackOffset SVECalleeSavedStack =
        StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
    if (UseFP) {
      if (isFixed)
        ScalableOffset = SVECalleeSavedStack;
      else if (!isCSR)
        ScalableOffset = SVECalleeSavedStack - SVEStackSize;
    } else {
      if (isFixed)
        ScalableOffset = SVEStackSize;
      else if (isCSR)
        ScalableOffset = SVEStackSize - SVECalleeSavedStack;
    }
  } else {
    if (UseFP && !(isFixed || isCSR))
      ScalableOffset = -SVEStackSize;
    if (!UseFP && (isFixed || isCSR))
      ScalableOffset = SVEStackSize;
  }

  if (UseFP) {
    FrameReg = RegInfo->getFrameRegister(MF);
    return StackOffset::getFixed(FPOffset) + ScalableOffset;
  }

  // Use the base pointer if we have one.
  if (RegInfo->hasBasePointer(MF))
    FrameReg = RegInfo->getBaseRegister();
  else {
    assert(!MFI.hasVarSizedObjects() &&
           "Can't use SP when we have var sized objects.");
    FrameReg = AArch64::SP;
    // If we're using the red zone for this function, the SP won't actually
    // be adjusted, so the offsets will be negative. They're also all
    // within range of the signed 9-bit immediate instructions.
    if (canUseRedZone(MF))
      Offset -= AFI->getLocalStackSize();
  }

  return StackOffset::getFixed(Offset) + ScalableOffset;
}

static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
  // Do not set a kill flag on values that are also marked as live-in. This
  // happens with the @llvm.returnaddress intrinsic and with arguments passed in
  // callee saved registers.
  // Omitting the kill flags is conservatively correct even if the live-in
  // is not used after all.
  bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
  return getKillRegState(!IsLiveIn);
}

static bool produceCompactUnwindFrame(MachineFunction &MF) {
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  AttributeList Attrs = MF.getFunction().getAttributes();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  return Subtarget.isTargetMachO() &&
         !(Subtarget.getTargetLowering()->supportSwiftError() &&
           Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
         MF.getFunction().getCallingConv() != CallingConv::SwiftTail &&
         !requiresSaveVG(MF) && !AFI->isSVECC();
}

static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
                                             bool NeedsWinCFI, bool IsFirst,
                                             const TargetRegisterInfo *TRI) {
  // If we are generating register pairs for a Windows function that requires
  // EH support, then pair consecutive registers only. There are no unwind
  // opcodes for saves/restores of non-consecutive register pairs.
  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
  // save_lrpair.
  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
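  // For example, under WinCFI a (x19, x20) pair can be described by
  // save_regp, but (x19, x21) has no matching unwind opcode, so such a pair
  // must be split into two individual saves (illustrative registers).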

  if (Reg2 == AArch64::FP)
    return true;
  if (!NeedsWinCFI)
    return false;
  if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1)
    return false;
  // If pairing a GPR with LR, the pair can be described by the save_lrpair
  // opcode. If this is the first register pair, it would end up with a
  // predecrement, but there's no save_lrpair_x opcode, so we can only do this
  // if LR is paired with something other than the first register.
  // The save_lrpair opcode requires the first register to be an odd one.
  if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
      (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
    return false;
  return true;
}

/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
/// WindowsCFI requires that only consecutive registers can be paired.
/// LR and FP need to be allocated together when the frame needs to save
/// the frame-record. This means any other register pairing with LR is invalid.
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
                                      bool UsesWinAAPCS, bool NeedsWinCFI,
                                      bool NeedsFrameRecord, bool IsFirst,
                                      const TargetRegisterInfo *TRI) {
  if (UsesWinAAPCS)
    return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst,
                                            TRI);

  // If we need to store the frame record, don't pair any register
  // with LR other than FP.
  if (NeedsFrameRecord)
    return Reg2 == AArch64::LR;

  return false;
}

namespace {

struct RegPairInfo {
  unsigned Reg1 = AArch64::NoRegister;
  unsigned Reg2 = AArch64::NoRegister;
  int FrameIdx;
  int Offset;
  enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
  const TargetRegisterClass *RC;

  RegPairInfo() = default;

  bool isPaired() const { return Reg2 != AArch64::NoRegister; }

  bool isScalable() const { return Type == PPR || Type == ZPR; }
};

} // end anonymous namespace

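// Pick a predicate register from p8..p15 that is already being spilled as a
// callee save (and is therefore safe to clobber between its save and
// restore), returned as its predicate-as-counter (PN) alias.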
unsigned findFreePredicateReg(BitVector &SavedRegs) {
  for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
    if (SavedRegs.test(PReg)) {
      unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
      return PNReg;
    }
  }
  return AArch64::NoRegister;
}

// Multi-vector LD/ST instructions are only available on SME or SVE2p1 targets.
bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget,
                                MachineFunction &MF) {
  if (DisableMultiVectorSpillFill)
    return false;

  SMEAttrs FuncAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
  bool IsLocallyStreaming =
      FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();

  // SME2 instructions can only be used safely while in streaming mode; they
  // cannot be used in streaming-compatible or locally streaming functions,
  // which may run (partly) outside of streaming mode.
  return Subtarget.hasSVE2p1() ||
         (Subtarget.hasSME2() &&
          (!IsLocallyStreaming && Subtarget.isStreaming()));
}

static void computeCalleeSaveRegisterPairs(
    MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
    const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
    bool NeedsFrameRecord) {

  if (CSI.empty())
    return;

  bool IsWindows = isTargetWindows(MF);
  bool NeedsWinCFI = needsWinCFI(MF);
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  unsigned StackHazardSize = getStackHazardSize(MF);
  MachineFrameInfo &MFI = MF.getFrameInfo();
  CallingConv::ID CC = MF.getFunction().getCallingConv();
  unsigned Count = CSI.size();
  (void)CC;
  // MachO's compact unwind format relies on all registers being stored in
  // pairs.
  assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
          CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
          CC == CallingConv::Win64 || (Count & 1) == 0) &&
         "Odd number of callee-saved regs to spill!");
  int ByteOffset = AFI->getCalleeSavedStackSize();
  int StackFillDir = -1;
  int RegInc = 1;
  unsigned FirstReg = 0;
  if (NeedsWinCFI) {
    // For WinCFI, fill the stack from the bottom up.
    ByteOffset = 0;
    StackFillDir = 1;
    // As the CSI array is reversed to match PrologEpilogInserter, iterate
    // backwards, to pair up registers starting from lower numbered registers.
    RegInc = -1;
    FirstReg = Count - 1;
  }
  bool FPAfterSVECalleeSaves = IsWindows && AFI->getSVECalleeSavedStackSize();
  int ScalableByteOffset =
      FPAfterSVECalleeSaves ? 0 : AFI->getSVECalleeSavedStackSize();
  bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
  Register LastReg = 0;

  // When iterating backwards, the loop condition relies on unsigned wraparound.
  for (unsigned i = FirstReg; i < Count; i += RegInc) {
    RegPairInfo RPI;
    RPI.Reg1 = CSI[i].getReg();

    if (AArch64::GPR64RegClass.contains(RPI.Reg1)) {
      RPI.Type = RegPairInfo::GPR;
      RPI.RC = &AArch64::GPR64RegClass;
    } else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) {
      RPI.Type = RegPairInfo::FPR64;
      RPI.RC = &AArch64::FPR64RegClass;
    } else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) {
      RPI.Type = RegPairInfo::FPR128;
      RPI.RC = &AArch64::FPR128RegClass;
    } else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) {
      RPI.Type = RegPairInfo::ZPR;
      RPI.RC = &AArch64::ZPRRegClass;
    } else if (AArch64::PPRRegClass.contains(RPI.Reg1)) {
      RPI.Type = RegPairInfo::PPR;
      RPI.RC = &AArch64::PPRRegClass;
    } else if (RPI.Reg1 == AArch64::VG) {
      RPI.Type = RegPairInfo::VG;
      RPI.RC = &AArch64::FIXED_REGSRegClass;
    } else {
      llvm_unreachable("Unsupported register class.");
    }

    // Add the stack hazard size as we transition from GPR->FPR CSRs.
    if (AFI->hasStackHazardSlotIndex() &&
        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
        AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
      ByteOffset += StackFillDir * StackHazardSize;
    LastReg = RPI.Reg1;

    int Scale = TRI->getSpillSize(*RPI.RC);
    // Add the next reg to the pair if it is in the same register class.
    if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
      MCRegister NextReg = CSI[i + RegInc].getReg();
      bool IsFirst = i == FirstReg;
      switch (RPI.Type) {
      case RegPairInfo::GPR:
        if (AArch64::GPR64RegClass.contains(NextReg) &&
            !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
                                       NeedsWinCFI, NeedsFrameRecord, IsFirst,
                                       TRI))
          RPI.Reg2 = NextReg;
        break;
      case RegPairInfo::FPR64:
        if (AArch64::FPR64RegClass.contains(NextReg) &&
            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
                                              IsFirst, TRI))
          RPI.Reg2 = NextReg;
        break;
      case RegPairInfo::FPR128:
        if (AArch64::FPR128RegClass.contains(NextReg))
          RPI.Reg2 = NextReg;
        break;
      case RegPairInfo::PPR:
        break;
      case RegPairInfo::ZPR:
        if (AFI->getPredicateRegForFillSpill() != 0 &&
            ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
          // Calculate offset of register pair to see if pair instruction can be
          // used.
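          // (The multi-vector ST1B_2Z_IMM/LD1B_2Z_IMM forms used for such
          // pairs only encode even immediates in [-16, 14], hence the range
          // and parity check below.)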
          int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
          if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
            RPI.Reg2 = NextReg;
        }
        break;
      case RegPairInfo::VG:
        break;
      }
    }

    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
    // list to come in sorted by frame index so that we can issue the store
    // pair instructions directly. Assert if we see anything otherwise.
    //
    // The order of the registers in the list is controlled by
    // getCalleeSavedRegs(), so they will always be in-order, as well.
    assert((!RPI.isPaired() ||
            (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
           "Out of order callee saved regs!");

    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
            RPI.Reg1 == AArch64::LR) &&
           "FrameRecord must be allocated together with LR");

    // Windows AAPCS has FP and LR reversed.
    assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
            RPI.Reg2 == AArch64::LR) &&
           "FrameRecord must be allocated together with LR");

    // MachO's compact unwind format relies on all registers being stored in
    // adjacent register pairs.
    assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost ||
            CC == CallingConv::PreserveAll || CC == CallingConv::CXX_FAST_TLS ||
            CC == CallingConv::Win64 ||
            (RPI.isPaired() &&
             ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
              RPI.Reg1 + 1 == RPI.Reg2))) &&
           "Callee-save registers not saved as adjacent register pair!");

    RPI.FrameIdx = CSI[i].getFrameIdx();
    if (NeedsWinCFI &&
        RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
      RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();

    // Realign the scalable offset if necessary. This is relevant when
    // spilling predicates on Windows.
    if (RPI.isScalable() && ScalableByteOffset % Scale != 0) {
      ScalableByteOffset = alignTo(ScalableByteOffset, Scale);
    }

    int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
    assert(OffsetPre % Scale == 0);

    if (RPI.isScalable())
      ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
    else
      ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);

    // Swift's async context is directly before FP, so allocate an extra
    // 8 bytes for it.
    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
        ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
         (IsWindows && RPI.Reg2 == AArch64::LR)))
      ByteOffset += StackFillDir * 8;

    // Round up size of non-pair to pair size if we need to pad the
    // callee-save area to ensure 16-byte alignment.
    if (NeedGapToAlignStack && !NeedsWinCFI && !RPI.isScalable() &&
        RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired() &&
        ByteOffset % 16 != 0) {
      ByteOffset += 8 * StackFillDir;
      assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
      // A stack frame with a gap looks like this, bottom up:
      // d9, d8. x21, gap, x20, x19.
      // Set extra alignment on the x21 object to create the gap above it.
      MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
      NeedGapToAlignStack = false;
    }

    int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
    assert(OffsetPost % Scale == 0);
    // If filling top down (default), we want the offset after incrementing it.
    // If filling bottom up (WinCFI) we need the original offset.
    int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;

    // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
    // Swift context can directly precede FP.
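    // (Seen bottom up, the expanded 24-byte slot is thus: async context,
    // then FP, then LR.)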
    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
        ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
         (IsWindows && RPI.Reg2 == AArch64::LR)))
      Offset += 8;
    RPI.Offset = Offset / Scale;

    assert((!RPI.isPaired() ||
            (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
            (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
           "Offset out of bounds for LDP/STP immediate");

    auto isFrameRecord = [&] {
      if (RPI.isPaired())
        return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
                         : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
      // Otherwise, look for the frame record as two unpaired registers. This is
      // needed for -aarch64-stack-hazard-size=<val>, which disables register
      // pairing (as the padding may be too large for the LDP/STP offset). Note:
      // On Windows, this check works out as current reg == FP, next reg == LR,
      // and on other platforms current reg == FP, previous reg == LR. This
      // works out as the correct pre-increment or post-increment offsets
      // respectively.
      return i > 0 && RPI.Reg1 == AArch64::FP &&
             CSI[i - 1].getReg() == AArch64::LR;
    };

    // Save the offset to frame record so that the FP register can point to the
    // innermost frame record (spilled FP and LR registers).
    if (NeedsFrameRecord && isFrameRecord())
      AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);

    RegPairs.push_back(RPI);
    if (RPI.isPaired())
      i += RegInc;
  }
  if (NeedsWinCFI) {
    // If we need an alignment gap in the stack, align the topmost stack
    // object. A stack frame with a gap looks like this, bottom up:
    // x19, d8. d9, gap.
    // Set extra alignment on the topmost stack object (the first element in
    // CSI, which goes top down), to create the gap above it.
    if (AFI->hasCalleeSaveStackFreeSpace())
      MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
    // We iterated bottom up over the registers; flip RegPairs back to top
    // down order.
    std::reverse(RegPairs.begin(), RegPairs.end());
  }
}

bool AArch64FrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool NeedsWinCFI = needsWinCFI(MF);
  DebugLoc DL;
  SmallVector<RegPairInfo, 8> RegPairs;

  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));

  MachineRegisterInfo &MRI = MF.getRegInfo();
  // Refresh the reserved regs in case there are any potential changes since the
  // last freeze.
  MRI.freezeReservedRegs();

  if (homogeneousPrologEpilog(MF)) {
    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
                   .setMIFlag(MachineInstr::FrameSetup);

    for (auto &RPI : RegPairs) {
      MIB.addReg(RPI.Reg1);
      MIB.addReg(RPI.Reg2);

      // Update register live in.
      if (!MRI.isReserved(RPI.Reg1))
        MBB.addLiveIn(RPI.Reg1);
      if (RPI.isPaired() && !MRI.isReserved(RPI.Reg2))
        MBB.addLiveIn(RPI.Reg2);
    }
    return true;
  }
  bool PTrueCreated = false;
  for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
    unsigned Reg1 = RPI.Reg1;
    unsigned Reg2 = RPI.Reg2;
    unsigned StrOpc;

    // Issue sequence of spills for cs regs. The first spill may be converted
    // to a pre-decrement store later by emitPrologue if the callee-save stack
    // area allocation can't be combined with the local stack area allocation.
    // For example:
    //    stp     x22, x21, [sp, #0]     // addImm(+0)
    //    stp     x20, x19, [sp, #16]    // addImm(+2)
    //    stp     fp, lr, [sp, #32]      // addImm(+4)
    // Rationale: This sequence saves uop updates compared to a sequence of
    // pre-increment spills like stp xi,xj,[sp,#-16]!
    // Note: Similar rationale and sequence for restores in epilog.
    unsigned Size = TRI->getSpillSize(*RPI.RC);
    Align Alignment = TRI->getSpillAlign(*RPI.RC);
    switch (RPI.Type) {
    case RegPairInfo::GPR:
      StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
      break;
    case RegPairInfo::FPR64:
      StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
      break;
    case RegPairInfo::FPR128:
      StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
      break;
    case RegPairInfo::ZPR:
      StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
      break;
    case RegPairInfo::PPR:
      StrOpc =
          Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
      break;
    case RegPairInfo::VG:
      StrOpc = AArch64::STRXui;
      break;
    }

    unsigned X0Scratch = AArch64::NoRegister;
    if (Reg1 == AArch64::VG) {
      // Find an available register to store the value of VG to.
      Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
      assert(Reg1 != AArch64::NoRegister);
      SMEAttrs Attrs = AFI->getSMEFnAttrs();

      if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
          AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
        // For locally-streaming functions, we need to store both the streaming
        // & non-streaming VG. Spill the streaming value first.
        BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
            .addImm(1)
            .setMIFlag(MachineInstr::FrameSetup);
        BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
            .addReg(Reg1)
            .addImm(3)
            .addImm(63)
            .setMIFlag(MachineInstr::FrameSetup);

        AFI->setStreamingVGIdx(RPI.FrameIdx);
      } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
        BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
            .addImm(31)
            .addImm(1)
            .setMIFlag(MachineInstr::FrameSetup);
        AFI->setVGIdx(RPI.FrameIdx);
      } else {
        const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
        if (llvm::any_of(
                MBB.liveins(),
                [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
                  return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
                      AArch64::X0, LiveIn.PhysReg);
                }))
          X0Scratch = Reg1;

        if (X0Scratch != AArch64::NoRegister)
          BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
              .addReg(AArch64::XZR)
              .addReg(AArch64::X0, RegState::Undef)
              .addReg(AArch64::X0, RegState::Implicit)
              .setMIFlag(MachineInstr::FrameSetup);

        const uint32_t *RegMask = TRI->getCallPreservedMask(
            MF,
            CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
        BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
            .addExternalSymbol("__arm_get_current_vg")
            .addRegMask(RegMask)
            .addReg(AArch64::X0, RegState::ImplicitDefine)
            .setMIFlag(MachineInstr::FrameSetup);
        Reg1 = AArch64::X0;
        AFI->setVGIdx(RPI.FrameIdx);
      }
    }

    LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
               dbgs() << ") -> fi#(" << RPI.FrameIdx;
               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
               dbgs() << ")\n");

    assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
           "Windows unwinding requires a consecutive (FP, LR) pair");
    // Windows unwind codes require consecutive registers if registers are
    // paired. Make the switch here, so that the code below will save (x,x+1)
    // and not (x+1,x).
    unsigned FrameIdxReg1 = RPI.FrameIdx;
    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
    if (NeedsWinCFI && RPI.isPaired()) {
      std::swap(Reg1, Reg2);
      std::swap(FrameIdxReg1, FrameIdxReg2);
    }

    if (RPI.isPaired() && RPI.isScalable()) {
      [[maybe_unused]] const AArch64Subtarget &Subtarget =
          MF.getSubtarget<AArch64Subtarget>();
      AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
      unsigned PnReg = AFI->getPredicateRegForFillSpill();
      assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) &&
             "Expects SVE2.1 or SME2 target and a predicate register");
#ifdef EXPENSIVE_CHECKS
      auto IsPPR = [](const RegPairInfo &c) {
        return c.Type == RegPairInfo::PPR;
      };
      auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
      auto IsZPR = [](const RegPairInfo &c) {
        return c.Type == RegPairInfo::ZPR;
      };
      auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
      assert(!(PPRBegin < ZPRBegin) &&
             "Expected callee save predicate to be handled first");
#endif
      if (!PTrueCreated) {
        PTrueCreated = true;
        BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
            .setMIFlags(MachineInstr::FrameSetup);
      }
      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
      if (!MRI.isReserved(Reg1))
        MBB.addLiveIn(Reg1);
      if (!MRI.isReserved(Reg2))
        MBB.addLiveIn(Reg2);
      MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
          MachineMemOperand::MOStore, Size, Alignment));
      MIB.addReg(PnReg);
      MIB.addReg(AArch64::SP)
          .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale],
                                  // where 2*vscale is implicit
          .setMIFlag(MachineInstr::FrameSetup);
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
          MachineMemOperand::MOStore, Size, Alignment));
      if (NeedsWinCFI)
        InsertSEH(MIB, TII, MachineInstr::FrameSetup);
    } else { // The case where no ZPR pair is present.
      MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
      if (!MRI.isReserved(Reg1))
        MBB.addLiveIn(Reg1);
      if (RPI.isPaired()) {
        if (!MRI.isReserved(Reg2))
          MBB.addLiveIn(Reg2);
        MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
        MIB.addMemOperand(MF.getMachineMemOperand(
            MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
            MachineMemOperand::MOStore, Size, Alignment));
      }
      MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
          .addReg(AArch64::SP)
          .addImm(RPI.Offset) // [sp, #offset*vscale],
                              // where factor*vscale is implicit
          .setMIFlag(MachineInstr::FrameSetup);
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
          MachineMemOperand::MOStore, Size, Alignment));
      if (NeedsWinCFI)
        InsertSEH(MIB, TII, MachineInstr::FrameSetup);
    }
    // Update the StackIDs of the SVE stack slots.
    MachineFrameInfo &MFI = MF.getFrameInfo();
    if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
      MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
      if (RPI.isPaired())
        MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
    }

    if (X0Scratch != AArch64::NoRegister)
      BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
          .addReg(AArch64::XZR)
          .addReg(X0Scratch, RegState::Undef)
          .addReg(X0Scratch, RegState::Implicit)
          .setMIFlag(MachineInstr::FrameSetup);
  }
  return true;
}

bool AArch64FrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction &MF = *MBB.getParent();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  DebugLoc DL;
  SmallVector<RegPairInfo, 8> RegPairs;
  bool NeedsWinCFI = needsWinCFI(MF);

  if (MBBI != MBB.end())
    DL = MBBI->getDebugLoc();

  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
  if (homogeneousPrologEpilog(MF, &MBB)) {
    auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
                   .setMIFlag(MachineInstr::FrameDestroy);
    for (auto &RPI : RegPairs) {
      MIB.addReg(RPI.Reg1, RegState::Define);
      MIB.addReg(RPI.Reg2, RegState::Define);
    }
    return true;
  }

  // For performance reasons, restore SVE registers in increasing order.
  auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
  auto PPRBegin = llvm::find_if(RegPairs, IsPPR);
  auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
  std::reverse(PPRBegin, PPREnd);
  auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
  auto ZPRBegin = llvm::find_if(RegPairs, IsZPR);
  auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
  std::reverse(ZPRBegin, ZPREnd);

  bool PTrueCreated = false;
  for (const RegPairInfo &RPI : RegPairs) {
    unsigned Reg1 = RPI.Reg1;
    unsigned Reg2 = RPI.Reg2;

    // Issue sequence of restores for cs regs. The last restore may be converted
    // to a post-increment load later by emitEpilogue if the callee-save stack
    // area allocation can't be combined with the local stack area allocation.
    // For example:
    //    ldp     fp, lr, [sp, #32]      // addImm(+4)
    //    ldp     x20, x19, [sp, #16]    // addImm(+2)
    //    ldp     x22, x21, [sp, #0]     // addImm(+0)
    // Note: see comment in spillCalleeSavedRegisters()
    unsigned LdrOpc;
    unsigned Size = TRI->getSpillSize(*RPI.RC);
    Align Alignment = TRI->getSpillAlign(*RPI.RC);
    switch (RPI.Type) {
    case RegPairInfo::GPR:
      LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
      break;
    case RegPairInfo::FPR64:
      LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
      break;
    case RegPairInfo::FPR128:
      LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
      break;
    case RegPairInfo::ZPR:
      LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
      break;
    case RegPairInfo::PPR:
      LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
                          : AArch64::LDR_PXI;
      break;
    case RegPairInfo::VG:
      continue;
    }
    LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
               if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
               dbgs() << ") -> fi#(" << RPI.FrameIdx;
               if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
               dbgs() << ")\n");

    // Windows unwind codes require consecutive registers if registers are
    // paired. Make the switch here, so that the code below will save (x,x+1)
    // and not (x+1,x).
    unsigned FrameIdxReg1 = RPI.FrameIdx;
    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
    if (NeedsWinCFI && RPI.isPaired()) {
      std::swap(Reg1, Reg2);
      std::swap(FrameIdxReg1, FrameIdxReg2);
    }

    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
    if (RPI.isPaired() && RPI.isScalable()) {
      [[maybe_unused]] const AArch64Subtarget &Subtarget =
          MF.getSubtarget<AArch64Subtarget>();
      unsigned PnReg = AFI->getPredicateRegForFillSpill();
      assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) &&
             "Expects SVE2.1 or SME2 target and a predicate register");
#ifdef EXPENSIVE_CHECKS
      assert(!(PPRBegin < ZPRBegin) &&
             "Expected callee save predicate to be handled first");
#endif
      if (!PTrueCreated) {
        PTrueCreated = true;
        BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
            .setMIFlags(MachineInstr::FrameDestroy);
      }
      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
      MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
                 getDefRegState(true));
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
          MachineMemOperand::MOLoad, Size, Alignment));
      MIB.addReg(PnReg);
      MIB.addReg(AArch64::SP)
          .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale]
                                  // where 2*vscale is implicit
          .setMIFlag(MachineInstr::FrameDestroy);
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
          MachineMemOperand::MOLoad, Size, Alignment));
      if (NeedsWinCFI)
        InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
    } else {
      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
      if (RPI.isPaired()) {
        MIB.addReg(Reg2, getDefRegState(true));
        MIB.addMemOperand(MF.getMachineMemOperand(
            MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
            MachineMemOperand::MOLoad, Size, Alignment));
      }
      MIB.addReg(Reg1, getDefRegState(true));
      MIB.addReg(AArch64::SP)
          .addImm(RPI.Offset) // [sp, #offset*vscale]
                              // where factor*vscale is implicit
          .setMIFlag(MachineInstr::FrameDestroy);
      MIB.addMemOperand(MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
          MachineMemOperand::MOLoad, Size, Alignment));
      if (NeedsWinCFI)
        InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
    }
  }
  return true;
}

// Return the FrameID for an MMO.
static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
                                        const MachineFrameInfo &MFI) {
  auto *PSV =
      dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
  if (PSV)
    return std::optional<int>(PSV->getFrameIndex());

  if (MMO->getValue()) {
    if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
      for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
           FI++)
        if (MFI.getObjectAllocation(FI) == Al)
          return FI;
    }
  }

  return std::nullopt;
}

// Return the FrameID for a Load/Store instruction by looking at the first MMO.
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
                                         const MachineFrameInfo &MFI) {
  if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
    return std::nullopt;

  return getMMOFrameID(*MI.memoperands_begin(), MFI);
}

// Check if a Hazard slot is needed for the current function, and if so create
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
// which can be used to determine if any hazard padding is needed.
void AArch64FrameLowering::determineStackHazardSlot(
    MachineFunction &MF, BitVector &SavedRegs) const {
  unsigned StackHazardSize = getStackHazardSize(MF);
  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
      AFI->hasStackHazardSlotIndex())
    return;

  // Stack hazards are only needed in streaming functions.
  SMEAttrs Attrs = AFI->getSMEFnAttrs();
  if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
    return;

  MachineFrameInfo &MFI = MF.getFrameInfo();

  // Add a hazard slot if there are any CSR FPR registers, or there are any
  // FP-only stack objects.
  bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
    return AArch64::FPR64RegClass.contains(Reg) ||
           AArch64::FPR128RegClass.contains(Reg) ||
           AArch64::ZPRRegClass.contains(Reg) ||
           AArch64::PPRRegClass.contains(Reg);
  });
  bool HasFPRStackObjects = false;
  if (!HasFPRCSRs) {
    std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        std::optional<int> FI = getLdStFrameID(MI, MFI);
        if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
          if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
              AArch64InstrInfo::isFpOrNEON(MI))
            FrameObjects[*FI] |= 2;
          else
            FrameObjects[*FI] |= 1;
        }
      }
    }
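    // A slot is considered FPR-only if every access to it was an FPR/SVE
    // access (mask == 2) and it was never touched by a non-FPR access
    // (bit 1 clear).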
    HasFPRStackObjects =
        any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
  }

  if (HasFPRCSRs || HasFPRStackObjects) {
    int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
    LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
                      << StackHazardSize << "\n");
    AFI->setStackHazardSlotIndex(ID);
  }
}

void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                BitVector &SavedRegs,
                                                RegScavenger *RS) const {
  // All calls are tail calls in GHC calling conv, and functions have no
  // prologue/epilogue.
  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
    return;

  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
      MF.getSubtarget().getRegisterInfo());
  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  unsigned UnspilledCSGPR = AArch64::NoRegister;
  unsigned UnspilledCSGPRPaired = AArch64::NoRegister;

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();

  unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
                                ? RegInfo->getBaseRegister()
                                : (unsigned)AArch64::NoRegister;

  unsigned ExtraCSSpill = 0;
  bool HasUnpairedGPR64 = false;
  bool HasPairZReg = false;
  BitVector UserReservedRegs = RegInfo->getUserReservedRegs(MF);
  BitVector ReservedRegs = RegInfo->getReservedRegs(MF);

  // Figure out which callee-saved registers to save/restore.
  for (unsigned i = 0; CSRegs[i]; ++i) {
    const unsigned Reg = CSRegs[i];

    // Add the base pointer register to SavedRegs if it is callee-save.
    if (Reg == BasePointerReg)
      SavedRegs.set(Reg);

    // Don't save manually reserved registers set through +reserve-x#i,
    // even for callee-saved registers, as per GCC's behavior.
    if (UserReservedRegs[Reg]) {
      SavedRegs.reset(Reg);
      continue;
    }

    bool RegUsed = SavedRegs.test(Reg);
    unsigned PairedReg = AArch64::NoRegister;
    const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
    if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
        AArch64::FPR128RegClass.contains(Reg)) {
      // Compensate for odd numbers of GP CSRs.
      // For now, all the known cases of odd number of CSRs are of GPRs.
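      // (CSRegs[i ^ 1] pairs index 0 with 1, 2 with 3, and so on. Once an
      // unpaired GPR has flipped the parity, index 1 pairs with 2, 3 with 4,
      // etc., which is what the i % 2 form below computes.)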
      if (HasUnpairedGPR64)
        PairedReg = CSRegs[i % 2 == 0 ? i - 1 : i + 1];
      else
        PairedReg = CSRegs[i ^ 1];
    }

    // If the function requires all the GP registers to save (SavedRegs),
    // and there are an odd number of GP CSRs at the same time (CSRegs),
    // PairedReg could be in a different register class from Reg, which would
    // lead to an FPR (usually D8) accidentally being marked saved.
    if (RegIsGPR64 && !AArch64::GPR64RegClass.contains(PairedReg)) {
      PairedReg = AArch64::NoRegister;
      HasUnpairedGPR64 = true;
    }
    assert(PairedReg == AArch64::NoRegister ||
           AArch64::GPR64RegClass.contains(Reg, PairedReg) ||
           AArch64::FPR64RegClass.contains(Reg, PairedReg) ||
           AArch64::FPR128RegClass.contains(Reg, PairedReg));

    if (!RegUsed) {
      if (AArch64::GPR64RegClass.contains(Reg) && !ReservedRegs[Reg]) {
        UnspilledCSGPR = Reg;
        UnspilledCSGPRPaired = PairedReg;
      }
      continue;
    }

    // Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is
    // spilled. If all of p0-p3 are used as return values, p4 must be free
    // to reload p8-p15.
    if (RegInfo->getSpillSize(AArch64::PPRRegClass) == 16 &&
        AArch64::PPR_p8to15RegClass.contains(Reg)) {
      SavedRegs.set(AArch64::P4);
    }

    // MachO's compact unwind format relies on all registers being stored in
    // pairs.
    // FIXME: the usual format is actually better if unwinding isn't needed.
    if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
        !SavedRegs.test(PairedReg)) {
      SavedRegs.set(PairedReg);
      if (AArch64::GPR64RegClass.contains(PairedReg) &&
          !ReservedRegs[PairedReg])
        ExtraCSSpill = PairedReg;
    }
    // Check if there is a pair of ZRegs, so it can select PReg for spill/fill
    HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
                    SavedRegs.test(CSRegs[i ^ 1]));
  }

  if (HasPairZReg && enableMultiVectorSpillFill(Subtarget, MF)) {
    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
    // Find a suitable predicate register for the multi-vector spill/fill
    // instructions.
    unsigned PnReg = findFreePredicateReg(SavedRegs);
    if (PnReg != AArch64::NoRegister)
      AFI->setPredicateRegForFillSpill(PnReg);
    // If no free callee-save has been found, assign one.
    if (!AFI->getPredicateRegForFillSpill() &&
        MF.getFunction().getCallingConv() ==
            CallingConv::AArch64_SVE_VectorCall) {
      SavedRegs.set(AArch64::P8);
      AFI->setPredicateRegForFillSpill(AArch64::PN8);
    }

    assert(!ReservedRegs[AFI->getPredicateRegForFillSpill()] &&
           "Predicate cannot be a reserved register");
  }

  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
      !Subtarget.isTargetWindows()) {
    // For the Windows calling convention on a non-Windows OS, where X18 is
    // treated as reserved, back up X18 when entering non-Windows code (marked
    // with the Windows calling convention) and restore it when returning,
    // regardless of whether the individual function uses it - it might call
    // other functions that clobber it.
    SavedRegs.set(AArch64::X18);
  }
4050
4051 // Calculates the callee saved stack size.
4052 unsigned CSStackSize = 0;
4053 unsigned SVECSStackSize = 0;
4054 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
4055 for (unsigned Reg : SavedRegs.set_bits()) {
4056 auto *RC = TRI->getMinimalPhysRegClass(Reg);
4057 assert(RC && "expected register class!");
4058 auto SpillSize = TRI->getSpillSize(*RC);
4059 if (AArch64::PPRRegClass.contains(Reg) ||
4060 AArch64::ZPRRegClass.contains(Reg))
4061 SVECSStackSize += SpillSize;
4062 else
4063 CSStackSize += SpillSize;
4064 }
4065
4066 // Save number of saved regs, so we can easily update CSStackSize later to
4067 // account for any additional 64-bit GPR saves. Note: After this point
4068 // only 64-bit GPRs can be added to SavedRegs.
4069 unsigned NumSavedRegs = SavedRegs.count();
4070
4071 // Increase the callee-saved stack size if the function has streaming mode
4072 // changes, as we will need to spill the value of the VG register.
4073 // For locally streaming functions, we spill both the streaming and
4074 // non-streaming VG value.
4075 SMEAttrs Attrs = AFI->getSMEFnAttrs();
4076 if (requiresSaveVG(MF)) {
4077 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
4078 CSStackSize += 16;
4079 else
4080 CSStackSize += 8;
4081 }
4082
4083 // Determine if a Hazard slot should be used, and increase the CSStackSize by
4084 // StackHazardSize if so.
4085 determineStackHazardSlot(MF, SavedRegs);
4086 if (AFI->hasStackHazardSlotIndex())
4087 CSStackSize += getStackHazardSize(MF);
4088
4089 // If we must call __arm_get_current_vg in the prologue preserve the LR.
4090 if (requiresSaveVG(MF) && !Subtarget.hasSVE())
4091 SavedRegs.set(AArch64::LR);
4092
4093 // The frame record needs to be created by saving the appropriate registers
4094 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
4095 if (hasFP(MF) ||
4096 windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
4097 SavedRegs.set(AArch64::FP);
4098 SavedRegs.set(AArch64::LR);
4099 }
4100
4101 LLVM_DEBUG({
4102 dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
4103 for (unsigned Reg : SavedRegs.set_bits())
4104 dbgs() << ' ' << printReg(Reg, RegInfo);
4105 dbgs() << "\n";
4106 });
4107
4108 // If any callee-saved registers are used, the frame cannot be eliminated.
4109 int64_t SVEStackSize =
4110 alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
4111 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
4112
4113 // The CSR spill slots have not been allocated yet, so estimateStackSize
4114 // won't include them.
4115 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
4116
4117 // We may address some of the stack above the canonical frame address, either
4118 // for our own arguments or during a call. Include that in calculating whether
4119 // we have complicated addressing concerns.
4120 int64_t CalleeStackUsed = 0;
4121 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
4122 int64_t FixedOff = MFI.getObjectOffset(I);
4123 if (FixedOff > CalleeStackUsed)
4124 CalleeStackUsed = FixedOff;
4125 }
4126
4127 // Conservatively always assume BigStack when there are SVE spills.
4128 bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize +
4129 CalleeStackUsed) > EstimatedStackSizeLimit;
4130 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
4131 AFI->setHasStackFrame(true);
4132
4133 // Estimate if we might need to scavenge a register at some point in order
4134 // to materialize a stack offset. If so, either spill one additional
4135 // callee-saved register or reserve a special spill slot to facilitate
4136 // register scavenging. If we already spilled an extra callee-saved register
4137 // above to keep the number of spills even, we don't need to do anything else
4138 // here.
4139 if (BigStack) {
4140 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
4141 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
4142 << " to get a scratch register.\n");
4143 SavedRegs.set(UnspilledCSGPR);
4144 ExtraCSSpill = UnspilledCSGPR;
4145
4146 // MachO's compact unwind format relies on all registers being stored in
4147 // pairs, so if we need to spill one extra for BigStack, then we need to
4148 // store the pair.
4149 if (producePairRegisters(MF)) {
4150 if (UnspilledCSGPRPaired == AArch64::NoRegister) {
4151 // Failed to make a pair for compact unwind format, revert spilling.
4152 if (produceCompactUnwindFrame(MF)) {
4153 SavedRegs.reset(UnspilledCSGPR);
4154 ExtraCSSpill = AArch64::NoRegister;
4155 }
4156 } else
4157 SavedRegs.set(UnspilledCSGPRPaired);
4158 }
4159 }
4160
4161 // If we didn't find an extra callee-saved register to spill, create
4162 // an emergency spill slot.
4163 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
4164 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
4165 const TargetRegisterClass &RC = AArch64::GPR64RegClass;
4166 unsigned Size = TRI->getSpillSize(RC);
4167 Align Alignment = TRI->getSpillAlign(RC);
4168 int FI = MFI.CreateSpillStackObject(Size, Alignment);
4169 RS->addScavengingFrameIndex(FI);
4170 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
4171 << " as the emergency spill slot.\n");
4172 }
4173 }
4174
4175   // Add the size of additional 64-bit GPR saves.
4176 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
4177
4178 // A Swift asynchronous context extends the frame record with a pointer
4179 // directly before FP.
4180 if (hasFP(MF) && AFI->hasSwiftAsyncContext())
4181 CSStackSize += 8;
4182
4183 uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
4184 LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
4185 << EstimatedStackSize + AlignedCSStackSize << " bytes.\n");
4186
4187 assert((!MFI.isCalleeSavedInfoValid() ||
4188 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
4189 "Should not invalidate callee saved info");
4190
4191 // Round up to register pair alignment to avoid additional SP adjustment
4192 // instructions.
4193 AFI->setCalleeSavedStackSize(AlignedCSStackSize);
4194 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
4195 AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
4196 }
4197
4198 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
4199 MachineFunction &MF, const TargetRegisterInfo *RegInfo,
4200 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
4201 unsigned &MaxCSFrameIndex) const {
4202 bool NeedsWinCFI = needsWinCFI(MF);
4203 unsigned StackHazardSize = getStackHazardSize(MF);
4204 // To match the canonical windows frame layout, reverse the list of
4205 // callee saved registers to get them laid out by PrologEpilogInserter
4206 // in the right order. (PrologEpilogInserter allocates stack objects top
4207 // down. Windows canonical prologs store higher numbered registers at
4208 // the top, thus have the CSI array start from the highest registers.)
4209 if (NeedsWinCFI)
4210 std::reverse(CSI.begin(), CSI.end());
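// Illustrative example: if CSI holds {x19, x20, ..., x28}, the reversal
// yields {x28, ..., x20, x19}, so PrologEpilogInserter's top-down
// allocation places x28 at the highest address, matching the canonical
// Windows layout described above.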
4211
4212 if (CSI.empty())
4213 return true; // Early exit if no callee saved registers are modified!
4214
4215 // Now that we know which registers need to be saved and restored, allocate
4216 // stack slots for them.
4217 MachineFrameInfo &MFI = MF.getFrameInfo();
4218 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
4219
4220 bool UsesWinAAPCS = isTargetWindows(MF);
4221 if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
4222 int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
4223 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
4224 if ((unsigned)FrameIdx < MinCSFrameIndex)
4225 MinCSFrameIndex = FrameIdx;
4226 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4227 MaxCSFrameIndex = FrameIdx;
4228 }
4229
4230 // Insert VG into the list of CSRs, immediately before LR if saved.
4231 if (requiresSaveVG(MF)) {
4232 std::vector<CalleeSavedInfo> VGSaves;
4233 SMEAttrs Attrs = AFI->getSMEFnAttrs();
4234
4235 auto VGInfo = CalleeSavedInfo(AArch64::VG);
4236 VGInfo.setRestored(false);
4237 VGSaves.push_back(VGInfo);
4238
4239 // Add VG again if the function is locally-streaming, as we will spill two
4240 // values.
4241 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
4242 VGSaves.push_back(VGInfo);
4243
4244 bool InsertBeforeLR = false;
4245
4246 for (unsigned I = 0; I < CSI.size(); I++)
4247 if (CSI[I].getReg() == AArch64::LR) {
4248 InsertBeforeLR = true;
4249 CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
4250 break;
4251 }
4252
4253 if (!InsertBeforeLR)
4254 llvm::append_range(CSI, VGSaves);
4255 }
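// Illustrative example: if CSI is {x19, fp, lr}, a single VG save is
// inserted before lr, giving {x19, fp, VG, lr}; a locally-streaming
// function inserts two VG entries, giving {x19, fp, VG, VG, lr}.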
4256
4257 Register LastReg = 0;
4258 int HazardSlotIndex = std::numeric_limits<int>::max();
4259 for (auto &CS : CSI) {
4260 MCRegister Reg = CS.getReg();
4261 const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
4262
4263 // Create a hazard slot as we switch between GPR and FPR CSRs.
4264 if (AFI->hasStackHazardSlotIndex() &&
4265 (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
4266 AArch64InstrInfo::isFpOrNEON(Reg)) {
4267 assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
4268 "Unexpected register order for hazard slot");
4269 HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
4270 LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
4271 << "\n");
4272 AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
4273 if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
4274 MinCSFrameIndex = HazardSlotIndex;
4275 if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
4276 MaxCSFrameIndex = HazardSlotIndex;
4277 }
4278
4279 unsigned Size = RegInfo->getSpillSize(*RC);
4280 Align Alignment(RegInfo->getSpillAlign(*RC));
4281 int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
4282 CS.setFrameIdx(FrameIdx);
4283
4284 if ((unsigned)FrameIdx < MinCSFrameIndex)
4285 MinCSFrameIndex = FrameIdx;
4286 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4287 MaxCSFrameIndex = FrameIdx;
4288
4289 // Grab 8 bytes below FP for the extended asynchronous frame info.
4290 if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
4291 Reg == AArch64::FP) {
4292 FrameIdx = MFI.CreateStackObject(8, Alignment, true);
4293 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
4294 if ((unsigned)FrameIdx < MinCSFrameIndex)
4295 MinCSFrameIndex = FrameIdx;
4296 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4297 MaxCSFrameIndex = FrameIdx;
4298 }
4299 LastReg = Reg;
4300 }
4301
4302 // Add hazard slot in the case where no FPR CSRs are present.
4303 if (AFI->hasStackHazardSlotIndex() &&
4304 HazardSlotIndex == std::numeric_limits<int>::max()) {
4305 HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
4306 LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
4307 << "\n");
4308 AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
4309 if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
4310 MinCSFrameIndex = HazardSlotIndex;
4311 if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
4312 MaxCSFrameIndex = HazardSlotIndex;
4313 }
4314
4315 return true;
4316 }
4317
4318 bool AArch64FrameLowering::enableStackSlotScavenging(
4319 const MachineFunction &MF) const {
4320 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4321 // If the function has streaming-mode changes, don't scavenge a
4322   // spill slot in the callee-save area, as that might require an
4323 // 'addvl' in the streaming-mode-changing call-sequence when the
4324 // function doesn't use a FP.
4325 if (AFI->hasStreamingModeChanges() && !hasFP(MF))
4326 return false;
4327   // Don't allow register scavenging with hazard slots, in case it moves objects
4328 // into the wrong place.
4329 if (AFI->hasStackHazardSlotIndex())
4330 return false;
4331 return AFI->hasCalleeSaveStackFreeSpace();
4332 }
4333
4334 /// Returns true if there are any SVE callee saves.
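/// For example (illustrative): if the SVE callee saves occupy consecutive
/// frame indices 2, 3 and 4, this returns true with Min = 2 and Max = 4.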
4335 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
4336 int &Min, int &Max) {
4337 Min = std::numeric_limits<int>::max();
4338 Max = std::numeric_limits<int>::min();
4339
4340 if (!MFI.isCalleeSavedInfoValid())
4341 return false;
4342
4343 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
4344 for (auto &CS : CSI) {
4345 if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
4346 AArch64::PPRRegClass.contains(CS.getReg())) {
4347 assert((Max == std::numeric_limits<int>::min() ||
4348 Max + 1 == CS.getFrameIdx()) &&
4349 "SVE CalleeSaves are not consecutive");
4350
4351 Min = std::min(Min, CS.getFrameIdx());
4352 Max = std::max(Max, CS.getFrameIdx());
4353 }
4354 }
4355 return Min != std::numeric_limits<int>::max();
4356 }
4357
4358 // Process all the SVE stack objects and determine offsets for each
4359 // object. If AssignOffsets is true, the offsets get assigned.
4360 // Fills in the first and last callee-saved frame indices into
4361 // Min/MaxCSFrameIndex, respectively.
4362 // Returns the size of the SVE stack area.
4363 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
4364 int &MinCSFrameIndex,
4365 int &MaxCSFrameIndex,
4366 bool AssignOffsets) {
4367 #ifndef NDEBUG
4368 // First process all fixed stack objects.
4369 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
4370 assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
4371 "SVE vectors should never be passed on the stack by value, only by "
4372 "reference.");
4373 #endif
4374
4375 auto Assign = [&MFI](int FI, int64_t Offset) {
4376 LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
4377 MFI.setObjectOffset(FI, Offset);
4378 };
4379
4380 int64_t Offset = 0;
4381
4382 // Then process all callee saved slots.
4383 if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
4384 // Assign offsets to the callee save slots.
4385 for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
4386 Offset += MFI.getObjectSize(I);
4387 Offset = alignTo(Offset, MFI.getObjectAlign(I));
4388 if (AssignOffsets)
4389 Assign(I, -Offset);
4390 }
4391 }
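// Illustrative example: two ZPR callee saves of 16 (scalable) bytes each
// are assigned offsets -16 and -32 by the loop above (scalable bytes,
// counting downwards from the base of the SVE area).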
4392
4393   // Ensure that the callee-save area is aligned to 16 bytes.
4394 Offset = alignTo(Offset, Align(16U));
4395
4396 // Create a buffer of SVE objects to allocate and sort it.
4397 SmallVector<int, 8> ObjectsToAllocate;
4398   // If we have a stack protector, and we've previously decided that it needs
4399   // to live in the SVE stack area (because there are SVE objects on the
4400   // stack), then it needs to go first.
4401 int StackProtectorFI = -1;
4402 if (MFI.hasStackProtectorIndex()) {
4403 StackProtectorFI = MFI.getStackProtectorIndex();
4404 if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
4405 ObjectsToAllocate.push_back(StackProtectorFI);
4406 }
4407 for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
4408 unsigned StackID = MFI.getStackID(I);
4409 if (StackID != TargetStackID::ScalableVector)
4410 continue;
4411 if (I == StackProtectorFI)
4412 continue;
4413 if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
4414 continue;
4415 if (MFI.isDeadObjectIndex(I))
4416 continue;
4417
4418 ObjectsToAllocate.push_back(I);
4419 }
4420
4421 // Allocate all SVE locals and spills
4422 for (unsigned FI : ObjectsToAllocate) {
4423 Align Alignment = MFI.getObjectAlign(FI);
4424 // FIXME: Given that the length of SVE vectors is not necessarily a power of
4425 // two, we'd need to align every object dynamically at runtime if the
4426 // alignment is larger than 16. This is not yet supported.
4427 if (Alignment > Align(16))
4428 report_fatal_error(
4429 "Alignment of scalable vectors > 16 bytes is not yet supported");
4430
4431 Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
4432 if (AssignOffsets)
4433 Assign(FI, -Offset);
4434 }
4435
4436 return Offset;
4437 }
4438
4439 int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
4440 MachineFrameInfo &MFI) const {
4441 int MinCSFrameIndex, MaxCSFrameIndex;
4442 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
4443 }
4444
4445 int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
4446 MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
4447 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
4448 true);
4449 }
4450
4451 /// Attempts to scavenge a register from \p ScavengeableRegs given the used
4452 /// registers in \p UsedRegs.
4453 static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
4454 BitVector const &ScavengeableRegs,
4455 Register PreferredReg) {
4456 if (PreferredReg != AArch64::NoRegister && UsedRegs.available(PreferredReg))
4457 return PreferredReg;
4458 for (auto Reg : ScavengeableRegs.set_bits()) {
4459 if (UsedRegs.available(Reg))
4460 return Reg;
4461 }
4462 return AArch64::NoRegister;
4463 }
4464
4465 /// Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
4466 /// \p MachineInstrs.
4467 static void propagateFrameFlags(MachineInstr &SourceMI,
4468 ArrayRef<MachineInstr *> MachineInstrs) {
4469 for (MachineInstr *MI : MachineInstrs) {
4470 if (SourceMI.getFlag(MachineInstr::FrameSetup))
4471 MI->setFlag(MachineInstr::FrameSetup);
4472 if (SourceMI.getFlag(MachineInstr::FrameDestroy))
4473 MI->setFlag(MachineInstr::FrameDestroy);
4474 }
4475 }
4476
4477 /// RAII helper class for scavenging or spilling a register. On construction
4478 /// attempts to find a free register of class \p RC (given \p UsedRegs and \p
4479 /// AllocatableRegs), if no register can be found spills \p SpillCandidate to \p
4480 /// MaybeSpillFI to free a register. The freed register is returned via the \p
4481 /// FreeReg output parameter. On destruction, if there is a spill, its previous
4482 /// value is reloaded. The spilling and scavenging is only valid at the
4483 /// insertion point \p MBBI; this class should _not_ be used in places that
4484 /// create or manipulate basic blocks, moving the expected insertion point.
4485 struct ScopedScavengeOrSpill {
4486 ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
4487 ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;
4488
4489   ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
4490 MachineBasicBlock::iterator MBBI,
4491 Register SpillCandidate, const TargetRegisterClass &RC,
4492 LiveRegUnits const &UsedRegs,
4493 BitVector const &AllocatableRegs,
4494 std::optional<int> *MaybeSpillFI,
4495 Register PreferredReg = AArch64::NoRegister)
4496 : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
4497 *MF.getSubtarget().getInstrInfo())),
4498 TRI(*MF.getSubtarget().getRegisterInfo()) {
4499 FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs, PreferredReg);
4500 if (FreeReg != AArch64::NoRegister)
4501 return;
4502 assert(MaybeSpillFI && "Expected emergency spill slot FI information "
4503 "(attempted to spill in prologue/epilogue?)");
4504 if (!MaybeSpillFI->has_value()) {
4505 MachineFrameInfo &MFI = MF.getFrameInfo();
4506 *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
4507 TRI.getSpillAlign(RC));
4508 }
4509 FreeReg = SpillCandidate;
4510 SpillFI = MaybeSpillFI->value();
4511 TII.storeRegToStackSlot(MBB, MBBI, FreeReg, false, *SpillFI, &RC, &TRI,
4512 Register());
4513 }
4514
4515   bool hasSpilled() const { return SpillFI.has_value(); }
4516
4517 /// Returns the free register (found from scavenging or spilling a register).
4518   Register freeRegister() const { return FreeReg; }
4519
4520   Register operator*() const { return freeRegister(); }
4521
4522   ~ScopedScavengeOrSpill() {
4523 if (hasSpilled())
4524 TII.loadRegFromStackSlot(MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI,
4525 Register());
4526 }
4527
4528 private:
4529 MachineBasicBlock &MBB;
4530 MachineBasicBlock::iterator MBBI;
4531 const TargetRegisterClass &RC;
4532 const AArch64InstrInfo &TII;
4533 const TargetRegisterInfo &TRI;
4534 Register FreeReg = AArch64::NoRegister;
4535 std::optional<int> SpillFI;
4536 };
4537
4538 /// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
4539 /// FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4540 struct EmergencyStackSlots {
4541 std::optional<int> ZPRSpillFI;
4542 std::optional<int> PPRSpillFI;
4543 std::optional<int> GPRSpillFI;
4544 };
4545
4546 /// Registers available for scavenging (ZPR, PPR3b, GPR).
4547 struct ScavengeableRegs {
4548 BitVector ZPRRegs;
4549 BitVector PPR3bRegs;
4550 BitVector GPRRegs;
4551 };
4552
4553 static bool isInPrologueOrEpilogue(const MachineInstr &MI) {
4554 return MI.getFlag(MachineInstr::FrameSetup) ||
4555 MI.getFlag(MachineInstr::FrameDestroy);
4556 }
4557
4558 /// Expands:
4559 /// ```
4560 /// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
4561 /// ```
4562 /// To:
4563 /// ```
4564 /// $z0 = CPY_ZPzI_B $p0, 1, 0
4565 /// STR_ZXI $z0, $stack.0, 0
4566 /// ```
4567 /// While ensuring a ZPR ($z0 in this example) is free for the predicate (
4568 /// spilling if necessary).
4569 static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
4570 MachineInstr &MI,
4571 const TargetRegisterInfo &TRI,
4572 LiveRegUnits const &UsedRegs,
4573 ScavengeableRegs const &SR,
4574 EmergencyStackSlots &SpillSlots) {
4575 MachineFunction &MF = *MBB.getParent();
4576 auto *TII =
4577 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
4578
4579 ScopedScavengeOrSpill ZPredReg(
4580 MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
4581 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
4582
4583 SmallVector<MachineInstr *, 2> MachineInstrs;
4584 const DebugLoc &DL = MI.getDebugLoc();
4585 MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B))
4586 .addReg(*ZPredReg, RegState::Define)
4587 .add(MI.getOperand(0))
4588 .addImm(1)
4589 .addImm(0)
4590 .getInstr());
4591 MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI))
4592 .addReg(*ZPredReg)
4593 .add(MI.getOperand(1))
4594 .addImm(MI.getOperand(2).getImm())
4595 .setMemRefs(MI.memoperands())
4596 .getInstr());
4597 propagateFrameFlags(MI, MachineInstrs);
4598 }
4599
4600 /// Expands:
4601 /// ```
4602 /// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
4603 /// ```
4604 /// To:
4605 /// ```
4606 /// $z0 = LDR_ZXI %stack.0, 0
4607 /// $p0 = PTRUE_B 31, implicit $vg
4608 /// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
4609 /// ```
4610 /// While ensuring a ZPR ($z0 in this example) is free for the predicate (
4611 /// spilling if necessary). If the status flags are in use at the point of
4612 /// expansion they are preserved (by moving them to/from a GPR). This may cause
4613 /// an additional spill if no GPR is free at the expansion point.
4614 static bool expandFillPPRFromZPRSlotPseudo(
4615 MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI,
4616 LiveRegUnits const &UsedRegs, ScavengeableRegs const &SR,
4617 MachineInstr *&LastPTrue, EmergencyStackSlots &SpillSlots) {
4618 MachineFunction &MF = *MBB.getParent();
4619 auto *TII =
4620 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
4621
4622 ScopedScavengeOrSpill ZPredReg(
4623 MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
4624 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
4625
4626 ScopedScavengeOrSpill PredReg(
4627 MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
4628 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI,
4629 /*PreferredReg=*/
4630 LastPTrue ? LastPTrue->getOperand(0).getReg() : AArch64::NoRegister);
4631
4632 // Elide NZCV spills if we know it is not used.
4633 bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
4634 std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
4635 if (IsNZCVUsed)
4636 NZCVSaveReg.emplace(
4637 MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs,
4638 isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
4639 SmallVector<MachineInstr *, 4> MachineInstrs;
4640 const DebugLoc &DL = MI.getDebugLoc();
4641 MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
4642 .addReg(*ZPredReg, RegState::Define)
4643 .add(MI.getOperand(1))
4644 .addImm(MI.getOperand(2).getImm())
4645 .setMemRefs(MI.memoperands())
4646 .getInstr());
4647 if (IsNZCVUsed)
4648 MachineInstrs.push_back(
4649 BuildMI(MBB, MI, DL, TII->get(AArch64::MRS))
4650 .addReg(NZCVSaveReg->freeRegister(), RegState::Define)
4651 .addImm(AArch64SysReg::NZCV)
4652 .addReg(AArch64::NZCV, RegState::Implicit)
4653 .getInstr());
4654
4655 // Reuse previous ptrue if we know it has not been clobbered.
4656 if (LastPTrue) {
4657 assert(*PredReg == LastPTrue->getOperand(0).getReg());
4658 LastPTrue->moveBefore(&MI);
4659 } else {
4660 LastPTrue = BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
4661 .addReg(*PredReg, RegState::Define)
4662 .addImm(31);
4663 }
4664 MachineInstrs.push_back(LastPTrue);
4665 MachineInstrs.push_back(
4666 BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
4667 .addReg(MI.getOperand(0).getReg(), RegState::Define)
4668 .addReg(*PredReg)
4669 .addReg(*ZPredReg)
4670 .addImm(0)
4671 .addReg(AArch64::NZCV, RegState::ImplicitDefine)
4672 .getInstr());
4673 if (IsNZCVUsed)
4674 MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR))
4675 .addImm(AArch64SysReg::NZCV)
4676 .addReg(NZCVSaveReg->freeRegister())
4677 .addReg(AArch64::NZCV, RegState::ImplicitDefine)
4678 .getInstr());
4679
4680 propagateFrameFlags(MI, MachineInstrs);
4681 return PredReg.hasSpilled();
4682 }
4683
4684 /// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
4685 /// operations within the MachineBasicBlock \p MBB.
4686 static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
4687 const TargetRegisterInfo &TRI,
4688 ScavengeableRegs const &SR,
4689 EmergencyStackSlots &SpillSlots) {
4690 LiveRegUnits UsedRegs(TRI);
4691 UsedRegs.addLiveOuts(MBB);
4692 bool HasPPRSpills = false;
4693 MachineInstr *LastPTrue = nullptr;
4694 for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
4695 UsedRegs.stepBackward(MI);
4696 switch (MI.getOpcode()) {
4697 case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
4698 if (LastPTrue &&
4699 MI.definesRegister(LastPTrue->getOperand(0).getReg(), &TRI))
4700 LastPTrue = nullptr;
4701 HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR,
4702 LastPTrue, SpillSlots);
4703 MI.eraseFromParent();
4704 break;
4705 case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
4706 expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
4707 MI.eraseFromParent();
4708 [[fallthrough]];
4709 default:
4710 LastPTrue = nullptr;
4711 break;
4712 }
4713 }
4714
4715 return HasPPRSpills;
4716 }
4717
4718 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
4719 MachineFunction &MF, RegScavenger *RS) const {
4720
4721 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4722 const TargetSubtargetInfo &TSI = MF.getSubtarget();
4723 const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
4724
4725   // If predicate spills are 16 bytes, we may need to expand
4726 // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4727 if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) {
4728 auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
4729 BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
4730 assert(Regs.count() > 0 && "Expected scavengeable registers");
4731 return Regs;
4732 };
4733
4734 ScavengeableRegs SR{};
4735 SR.ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
4736 // Only p0-7 are possible as the second operand of cmpne (needed for fills).
4737 SR.PPR3bRegs = ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
4738 SR.GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
4739
4740 EmergencyStackSlots SpillSlots;
4741 for (MachineBasicBlock &MBB : MF) {
4742 // In the case we had to spill a predicate (in the range p0-p7) to reload
4743 // a predicate (>= p8), additional spill/fill pseudos will be created.
4744 // These need an additional expansion pass. Note: There will only be at
4745 // most two expansion passes, as spilling/filling a predicate in the range
4746 // p0-p7 never requires spilling another predicate.
4747 for (int Pass = 0; Pass < 2; Pass++) {
4748 bool HasPPRSpills =
4749 expandSMEPPRToZPRSpillPseudos(MBB, TRI, SR, SpillSlots);
4750 assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
4751 if (!HasPPRSpills)
4752 break;
4753 }
4754 }
4755 }
4756
4757 MachineFrameInfo &MFI = MF.getFrameInfo();
4758
4759 assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
4760 "Upwards growing stack unsupported");
4761
4762 int MinCSFrameIndex, MaxCSFrameIndex;
4763 int64_t SVEStackSize =
4764 assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
4765
4766 AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
4767 AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
4768
4769 // If this function isn't doing Win64-style C++ EH, we don't need to do
4770 // anything.
4771 if (!MF.hasEHFunclets())
4772 return;
4773
4774 // Win64 C++ EH needs to allocate space for the catch objects in the fixed
4775 // object area right next to the UnwindHelp object.
4776 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
4777 int64_t CurrentOffset =
4778 AFI->getVarArgsGPRSize() + AFI->getTailCallReservedStack();
4779 for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
4780 for (WinEHHandlerType &H : TBME.HandlerArray) {
4781 int FrameIndex = H.CatchObj.FrameIndex;
4782 if ((FrameIndex != INT_MAX) && MFI.getObjectOffset(FrameIndex) == 0) {
4783 CurrentOffset =
4784 alignTo(CurrentOffset, MFI.getObjectAlign(FrameIndex).value());
4785 CurrentOffset += MFI.getObjectSize(FrameIndex);
4786 MFI.setObjectOffset(FrameIndex, -CurrentOffset);
4787 }
4788 }
4789 }
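// Worked example (a sketch, assuming no varargs or tail-call reserved
// stack): a single 8-byte catch object makes CurrentOffset 8 and is
// placed at offset -8; the UnwindHelp object created below then lands at
// -alignTo(8 + 8, 16) = -16.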
4790
4791 // Create an UnwindHelp object.
4792 // The UnwindHelp object is allocated at the start of the fixed object area
4793 int64_t UnwindHelpOffset = alignTo(CurrentOffset + 8, Align(16));
4794 assert(UnwindHelpOffset == getFixedObjectSize(MF, AFI, /*IsWin64*/ true,
4795 /*IsFunclet*/ false) &&
4796 "UnwindHelpOffset must be at the start of the fixed object area");
4797 int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8, -UnwindHelpOffset,
4798 /*IsImmutable=*/false);
4799 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
4800
4801 MachineBasicBlock &MBB = MF.front();
4802 auto MBBI = MBB.begin();
4803 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
4804 ++MBBI;
4805
4806 // We need to store -2 into the UnwindHelp object at the start of the
4807 // function.
4808 DebugLoc DL;
4809 RS->enterBasicBlockEnd(MBB);
4810 RS->backward(MBBI);
4811 Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
4812 assert(DstReg && "There must be a free register after frame setup");
4813 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
4814 BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
4815 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
4816 .addReg(DstReg, getKillRegState(true))
4817 .addFrameIndex(UnwindHelpFI)
4818 .addImm(0);
4819 }
4820
4821 namespace {
4822 struct TagStoreInstr {
4823 MachineInstr *MI;
4824 int64_t Offset, Size;
4825   explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
4826 : MI(MI), Offset(Offset), Size(Size) {}
4827 };
4828
4829 class TagStoreEdit {
4830 MachineFunction *MF;
4831 MachineBasicBlock *MBB;
4832 MachineRegisterInfo *MRI;
4833 // Tag store instructions that are being replaced.
4834 SmallVector<TagStoreInstr, 8> TagStores;
4835 // Combined memref arguments of the above instructions.
4836 SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
4837
4838 // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
4839 // FrameRegOffset + Size) with the address tag of SP.
4840 Register FrameReg;
4841 StackOffset FrameRegOffset;
4842 int64_t Size;
4843 // If not std::nullopt, move FrameReg to (FrameReg + FrameRegUpdate) at the
4844 // end.
4845 std::optional<int64_t> FrameRegUpdate;
4846 // MIFlags for any FrameReg updating instructions.
4847 unsigned FrameRegUpdateFlags;
4848
4849 // Use zeroing instruction variants.
4850 bool ZeroData;
4851 DebugLoc DL;
4852
4853 void emitUnrolled(MachineBasicBlock::iterator InsertI);
4854 void emitLoop(MachineBasicBlock::iterator InsertI);
4855
4856 public:
4857   TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
4858 : MBB(MBB), ZeroData(ZeroData) {
4859 MF = MBB->getParent();
4860 MRI = &MF->getRegInfo();
4861 }
4862 // Add an instruction to be replaced. Instructions must be added in the
4863 // ascending order of Offset, and have to be adjacent.
4864   void addInstruction(TagStoreInstr I) {
4865 assert((TagStores.empty() ||
4866 TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
4867 "Non-adjacent tag store instructions.");
4868 TagStores.push_back(I);
4869 }
4870   void clear() { TagStores.clear(); }
4871 // Emit equivalent code at the given location, and erase the current set of
4872 // instructions. May skip if the replacement is not profitable. May invalidate
4873 // the input iterator and replace it with a valid one.
4874 void emitCode(MachineBasicBlock::iterator &InsertI,
4875 const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
4876 };
4877
4878 void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
4879 const AArch64InstrInfo *TII =
4880 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
4881
4882 const int64_t kMinOffset = -256 * 16;
4883 const int64_t kMaxOffset = 255 * 16;
4884
4885 Register BaseReg = FrameReg;
4886 int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
4887 if (BaseRegOffsetBytes < kMinOffset ||
4888 BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset ||
4889 // BaseReg can be FP, which is not necessarily aligned to 16-bytes. In
4890 // that case, BaseRegOffsetBytes will not be aligned to 16 bytes, which
4891 // is required for the offset of ST2G.
4892 BaseRegOffsetBytes % 16 != 0) {
4893 Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
4894 emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
4895 StackOffset::getFixed(BaseRegOffsetBytes), TII);
4896 BaseReg = ScratchReg;
4897 BaseRegOffsetBytes = 0;
4898 }
4899
4900 MachineInstr *LastI = nullptr;
4901 while (Size) {
4902 int64_t InstrSize = (Size > 16) ? 32 : 16;
4903 unsigned Opcode =
4904 InstrSize == 16
4905 ? (ZeroData ? AArch64::STZGi : AArch64::STGi)
4906 : (ZeroData ? AArch64::STZ2Gi : AArch64::ST2Gi);
4907 assert(BaseRegOffsetBytes % 16 == 0);
4908 MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
4909 .addReg(AArch64::SP)
4910 .addReg(BaseReg)
4911 .addImm(BaseRegOffsetBytes / 16)
4912 .setMemRefs(CombinedMemRefs);
4913 // A store to [BaseReg, #0] should go last for an opportunity to fold the
4914 // final SP adjustment in the epilogue.
4915 if (BaseRegOffsetBytes == 0)
4916 LastI = I;
4917 BaseRegOffsetBytes += InstrSize;
4918 Size -= InstrSize;
4919 }
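// Illustrative example: for Size = 48 starting at offset 0, the loop
// above emits an ST2G at [BaseReg, #0] covering 32 bytes and an STG at
// [BaseReg, #32]; the splice below then moves the offset-0 store last so
// a following SP adjustment can be folded into it.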
4920
4921 if (LastI)
4922 MBB->splice(InsertI, MBB, LastI);
4923 }
4924
4925 void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
4926 const AArch64InstrInfo *TII =
4927 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
4928
4929 Register BaseReg = FrameRegUpdate
4930 ? FrameReg
4931 : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
4932 Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
4933
4934 emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
4935
4936 int64_t LoopSize = Size;
4937 // If the loop size is not a multiple of 32, split off one 16-byte store at
4938 // the end to fold BaseReg update into.
4939 if (FrameRegUpdate && *FrameRegUpdate)
4940 LoopSize -= LoopSize % 32;
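// Illustrative example: Size = 48 with a foldable base register update
// gives LoopSize = 32 for the STGloop below, leaving 16 bytes to be
// tagged by a post-indexed STG that also folds in the update.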
4941 MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
4942 TII->get(ZeroData ? AArch64::STZGloop_wback
4943 : AArch64::STGloop_wback))
4944 .addDef(SizeReg)
4945 .addDef(BaseReg)
4946 .addImm(LoopSize)
4947 .addReg(BaseReg)
4948 .setMemRefs(CombinedMemRefs);
4949 if (FrameRegUpdate)
4950 LoopI->setFlags(FrameRegUpdateFlags);
4951
4952 int64_t ExtraBaseRegUpdate =
4953 FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
4954 LLVM_DEBUG(dbgs() << "TagStoreEdit::emitLoop: LoopSize=" << LoopSize
4955 << ", Size=" << Size
4956 << ", ExtraBaseRegUpdate=" << ExtraBaseRegUpdate
4957 << ", FrameRegUpdate=" << FrameRegUpdate
4958 << ", FrameRegOffset.getFixed()="
4959 << FrameRegOffset.getFixed() << "\n");
4960 if (LoopSize < Size) {
4961 assert(FrameRegUpdate);
4962 assert(Size - LoopSize == 16);
4963 // Tag 16 more bytes at BaseReg and update BaseReg.
4964 int64_t STGOffset = ExtraBaseRegUpdate + 16;
4965 assert(STGOffset % 16 == 0 && STGOffset >= -4096 && STGOffset <= 4080 &&
4966 "STG immediate out of range");
4967 BuildMI(*MBB, InsertI, DL,
4968 TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
4969 .addDef(BaseReg)
4970 .addReg(BaseReg)
4971 .addReg(BaseReg)
4972 .addImm(STGOffset / 16)
4973 .setMemRefs(CombinedMemRefs)
4974 .setMIFlags(FrameRegUpdateFlags);
4975 } else if (ExtraBaseRegUpdate) {
4976 // Update BaseReg.
4977 int64_t AddSubOffset = std::abs(ExtraBaseRegUpdate);
4978 assert(AddSubOffset <= 4095 && "ADD/SUB immediate out of range");
4979 BuildMI(
4980 *MBB, InsertI, DL,
4981 TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
4982 .addDef(BaseReg)
4983 .addReg(BaseReg)
4984 .addImm(AddSubOffset)
4985 .addImm(0)
4986 .setMIFlags(FrameRegUpdateFlags);
4987 }
4988 }
4989
4990 // Check if *II is a register update that can be merged into the STGloop that
4991 // ends at (Reg + Size). *TotalOffset is set to the total adjustment that the
4992 // update instruction applies to Reg.
4993 bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
4994 int64_t Size, int64_t *TotalOffset) {
4995 MachineInstr &MI = *II;
4996 if ((MI.getOpcode() == AArch64::ADDXri ||
4997 MI.getOpcode() == AArch64::SUBXri) &&
4998 MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
4999 unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
5000 int64_t Offset = MI.getOperand(2).getImm() << Shift;
5001 if (MI.getOpcode() == AArch64::SUBXri)
5002 Offset = -Offset;
5003 int64_t PostOffset = Offset - Size;
5004 // TagStoreEdit::emitLoop might emit either an ADD/SUB after the loop, or
5005     // an STGPostIndex which writes the last 16 bytes of tags. Which one is
5006 // chosen depends on the alignment of the loop size, but the difference
5007 // between the valid ranges for the two instructions is small, so we
5008 // conservatively assume that it could be either case here.
5009 //
5010 // Max offset of STGPostIndex, minus the 16 byte tag write folded into that
5011 // instruction.
5012 const int64_t kMaxOffset = 4080 - 16;
5013 // Max offset of SUBXri.
5014 const int64_t kMinOffset = -4095;
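// Illustrative example: a tag loop covering Size = 272 bytes followed by
// "ADDXri Reg, Reg, #288" gives Offset = 288 and PostOffset = 16, which
// is 16-byte aligned and in range, so the update is mergeable with
// *TotalOffset = 288.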
5015 if (PostOffset <= kMaxOffset && PostOffset >= kMinOffset &&
5016 PostOffset % 16 == 0) {
5017 *TotalOffset = Offset;
5018 return true;
5019 }
5020 }
5021 return false;
5022 }
5023
5024 void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
5025 SmallVectorImpl<MachineMemOperand *> &MemRefs) {
5026 MemRefs.clear();
5027 for (auto &TS : TSE) {
5028 MachineInstr *MI = TS.MI;
5029 // An instruction without memory operands may access anything. Be
5030 // conservative and return an empty list.
5031 if (MI->memoperands_empty()) {
5032 MemRefs.clear();
5033 return;
5034 }
5035 MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
5036 }
5037 }
5038
5039 void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
5040 const AArch64FrameLowering *TFI,
5041 bool TryMergeSPUpdate) {
5042 if (TagStores.empty())
5043 return;
5044 TagStoreInstr &FirstTagStore = TagStores[0];
5045 TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
5046 Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
5047 DL = TagStores[0].MI->getDebugLoc();
5048
5049 Register Reg;
5050 FrameRegOffset = TFI->resolveFrameOffsetReference(
5051 *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
5052 /*PreferFP=*/false, /*ForSimm=*/true);
5053 FrameReg = Reg;
5054 FrameRegUpdate = std::nullopt;
5055
5056 mergeMemRefs(TagStores, CombinedMemRefs);
5057
5058 LLVM_DEBUG({
5059 dbgs() << "Replacing adjacent STG instructions:\n";
5060 for (const auto &Instr : TagStores) {
5061 dbgs() << " " << *Instr.MI;
5062 }
5063 });
5064
5065 // Size threshold where a loop becomes shorter than a linear sequence of
5066 // tagging instructions.
5067 const int kSetTagLoopThreshold = 176;
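// (Illustrative: at 176 bytes an unrolled sequence would already need six
// STG/ST2G instructions, roughly comparable to the expanded loop.)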
5068 if (Size < kSetTagLoopThreshold) {
5069 if (TagStores.size() < 2)
5070 return;
5071 emitUnrolled(InsertI);
5072 } else {
5073 MachineInstr *UpdateInstr = nullptr;
5074 int64_t TotalOffset = 0;
5075 if (TryMergeSPUpdate) {
5076 // See if we can merge base register update into the STGloop.
5077 // This is done in AArch64LoadStoreOptimizer for "normal" stores,
5078 // but STGloop is way too unusual for that, and also it only
5079 // realistically happens in function epilogue. Also, STGloop is expanded
5080 // before that pass.
5081 if (InsertI != MBB->end() &&
5082 canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
5083 &TotalOffset)) {
5084 UpdateInstr = &*InsertI++;
5085 LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
5086 << *UpdateInstr);
5087 }
5088 }
5089
5090 if (!UpdateInstr && TagStores.size() < 2)
5091 return;
5092
5093 if (UpdateInstr) {
5094 FrameRegUpdate = TotalOffset;
5095 FrameRegUpdateFlags = UpdateInstr->getFlags();
5096 }
5097 emitLoop(InsertI);
5098 if (UpdateInstr)
5099 UpdateInstr->eraseFromParent();
5100 }
5101
5102 for (auto &TS : TagStores)
5103 TS.MI->eraseFromParent();
5104 }
5105
5106 bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
5107 int64_t &Size, bool &ZeroData) {
5108 MachineFunction &MF = *MI.getParent()->getParent();
5109 const MachineFrameInfo &MFI = MF.getFrameInfo();
5110
5111 unsigned Opcode = MI.getOpcode();
5112 ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGi ||
5113 Opcode == AArch64::STZ2Gi);
5114
5115 if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
5116 if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
5117 return false;
5118 if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
5119 return false;
5120 Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
5121 Size = MI.getOperand(2).getImm();
5122 return true;
5123 }
5124
5125 if (Opcode == AArch64::STGi || Opcode == AArch64::STZGi)
5126 Size = 16;
5127 else if (Opcode == AArch64::ST2Gi || Opcode == AArch64::STZ2Gi)
5128 Size = 32;
5129 else
5130 return false;
5131
5132 if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
5133 return false;
5134
5135 Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
5136 16 * MI.getOperand(2).getImm();
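// For example (illustrative): "STGi $sp, %stack.0, 2" yields Size = 16
// and Offset = getObjectOffset(%stack.0) + 32.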
5137 return true;
5138 }
5139
5140 // Detect a run of memory tagging instructions for adjacent stack frame slots,
5141 // and replace them with a shorter instruction sequence:
5142 // * replace STG + STG with ST2G
5143 // * replace STGloop + STGloop with STGloop
5144 // This code needs to run when stack slot offsets are already known, but before
5145 // FrameIndex operands in STG instructions are eliminated.
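// For example (illustrative): two STGs tagging adjacent 16-byte slots at
// offsets 0 and 16 become a single ST2G covering all 32 bytes.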
5146 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
5147 const AArch64FrameLowering *TFI,
5148 RegScavenger *RS) {
5149 bool FirstZeroData;
5150 int64_t Size, Offset;
5151 MachineInstr &MI = *II;
5152 MachineBasicBlock *MBB = MI.getParent();
5153 MachineBasicBlock::iterator NextI = ++II;
5154 if (&MI == &MBB->instr_back())
5155 return II;
5156 if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
5157 return II;
5158
5159 SmallVector<TagStoreInstr, 4> Instrs;
5160 Instrs.emplace_back(&MI, Offset, Size);
5161
5162 constexpr int kScanLimit = 10;
5163 int Count = 0;
5164 for (MachineBasicBlock::iterator E = MBB->end();
5165 NextI != E && Count < kScanLimit; ++NextI) {
5166 MachineInstr &MI = *NextI;
5167 bool ZeroData;
5168 int64_t Size, Offset;
5169 // Collect instructions that update memory tags with a FrameIndex operand
5170 // and (when applicable) constant size, and whose output registers are dead
5171 // (the latter is almost always the case in practice). Since these
5172 // instructions effectively have no inputs or outputs, we are free to skip
5173 // any non-aliasing instructions in between without tracking used registers.
5174 if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
5175 if (ZeroData != FirstZeroData)
5176 break;
5177 Instrs.emplace_back(&MI, Offset, Size);
5178 continue;
5179 }
5180
5181 // Only count non-transient, non-tagging instructions toward the scan
5182 // limit.
5183 if (!MI.isTransient())
5184 ++Count;
5185
5186 // Just in case, stop before the epilogue code starts.
5187 if (MI.getFlag(MachineInstr::FrameSetup) ||
5188 MI.getFlag(MachineInstr::FrameDestroy))
5189 break;
5190
5191 // Reject anything that may alias the collected instructions.
5192 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() || MI.isCall())
5193 break;
5194 }
5195
5196 // New code will be inserted after the last tagging instruction we've found.
5197 MachineBasicBlock::iterator InsertI = Instrs.back().MI;
5198
5199   // All the gathered stack tag instructions are merged and placed after the
5200   // last tag store in the list. Check whether the nzcv flag is live at that
5201   // insertion point; if it is, bail out, since any stg loops in the merged
5202   // sequence could clobber it.
5203 
5204   // FIXME: This approach of bailing out of the merge is conservative: the
5205   // liveness check is done even when the merged insert list contains no stg
5206   // loops, in which case it is not needed.
5207 LivePhysRegs LiveRegs(*(MBB->getParent()->getSubtarget().getRegisterInfo()));
5208 LiveRegs.addLiveOuts(*MBB);
5209 for (auto I = MBB->rbegin();; ++I) {
5210 MachineInstr &MI = *I;
5211 if (MI == InsertI)
5212 break;
5213 LiveRegs.stepBackward(*I);
5214 }
5215 InsertI++;
5216 if (LiveRegs.contains(AArch64::NZCV))
5217 return InsertI;
5218
5219 llvm::stable_sort(Instrs,
5220 [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
5221 return Left.Offset < Right.Offset;
5222 });
5223
5224 // Make sure that we don't have any overlapping stores.
5225 int64_t CurOffset = Instrs[0].Offset;
5226 for (auto &Instr : Instrs) {
5227 if (CurOffset > Instr.Offset)
5228 return NextI;
5229 CurOffset = Instr.Offset + Instr.Size;
5230 }
5231
5232 // Find contiguous runs of tagged memory and emit shorter instruction
5233 // sequences for them when possible.
5234 TagStoreEdit TSE(MBB, FirstZeroData);
5235 std::optional<int64_t> EndOffset;
5236 for (auto &Instr : Instrs) {
5237 if (EndOffset && *EndOffset != Instr.Offset) {
5238 // Found a gap.
5239 TSE.emitCode(InsertI, TFI, /*TryMergeSPUpdate = */ false);
5240 TSE.clear();
5241 }
5242
5243 TSE.addInstruction(Instr);
5244 EndOffset = Instr.Offset + Instr.Size;
5245 }
5246
5247 const MachineFunction *MF = MBB->getParent();
5248 // Multiple FP/SP updates in a loop cannot be described by CFI instructions.
5249 TSE.emitCode(
5250 InsertI, TFI, /*TryMergeSPUpdate = */
5251 !MF->getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(*MF));
5252
5253 return InsertI;
5254 }
5255 } // namespace
5256
5257 static void emitVGSaveRestore(MachineBasicBlock::iterator II,
5258 const AArch64FrameLowering *TFI) {
5259 MachineInstr &MI = *II;
5260 MachineBasicBlock *MBB = MI.getParent();
5261 MachineFunction *MF = MBB->getParent();
5262
5263 if (MI.getOpcode() != AArch64::VGSavePseudo &&
5264 MI.getOpcode() != AArch64::VGRestorePseudo)
5265 return;
5266
5267 auto *AFI = MF->getInfo<AArch64FunctionInfo>();
5268 SMEAttrs FuncAttrs = AFI->getSMEFnAttrs();
5269 bool LocallyStreaming =
5270 FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
5271
5272 int64_t VGFrameIdx =
5273 LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
5274 assert(VGFrameIdx != std::numeric_limits<int>::max() &&
5275 "Expected FrameIdx for VG");
5276
5277 CFIInstBuilder CFIBuilder(*MBB, II, MachineInstr::NoFlags);
5278 if (MI.getOpcode() == AArch64::VGSavePseudo) {
5279 const MachineFrameInfo &MFI = MF->getFrameInfo();
5280 int64_t Offset =
5281 MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
5282 CFIBuilder.buildOffset(AArch64::VG, Offset);
5283 } else {
5284 CFIBuilder.buildRestore(AArch64::VG);
5285 }
5286
5287 MI.eraseFromParent();
5288 }
5289
5290 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
5291 MachineFunction &MF, RegScavenger *RS = nullptr) const {
5292 for (auto &BB : MF)
5293 for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
5294 if (requiresSaveVG(MF))
5295 emitVGSaveRestore(II++, this);
5296 else if (StackTaggingMergeSetTag)
5297 II = tryMergeAdjacentSTG(II, this, RS);
5298 }
5299
5300 // By the time this method is called, most of the prologue/epilogue code is
5301 // already emitted, whether its location was affected by the shrink-wrapping
5302 // optimization or not.
5303 if (!MF.getFunction().hasFnAttribute(Attribute::Naked) &&
5304 shouldSignReturnAddressEverywhere(MF))
5305 emitPacRetPlusLeafHardening(MF);
5306 }
5307
5308 /// For Win64 AArch64 EH, the offset to the UnwindHelp object is from the SP
5309 /// before the update. This is easily retrieved as it is exactly the offset
5310 /// that is set in processFunctionBeforeFrameFinalized.
5311 StackOffset AArch64FrameLowering::getFrameIndexReferencePreferSP(
5312 const MachineFunction &MF, int FI, Register &FrameReg,
5313 bool IgnoreSPUpdates) const {
5314 const MachineFrameInfo &MFI = MF.getFrameInfo();
5315 if (IgnoreSPUpdates) {
5316 LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
5317 << MFI.getObjectOffset(FI) << "\n");
5318 FrameReg = AArch64::SP;
5319 return StackOffset::getFixed(MFI.getObjectOffset(FI));
5320 }
5321
5322 // Go to common code if we cannot provide sp + offset.
5323 if (MFI.hasVarSizedObjects() ||
5324 MF.getInfo<AArch64FunctionInfo>()->getStackSizeSVE() ||
5325 MF.getSubtarget().getRegisterInfo()->hasStackRealignment(MF))
5326 return getFrameIndexReference(MF, FI, FrameReg);
5327
5328 FrameReg = AArch64::SP;
5329 return getStackOffset(MF, MFI.getObjectOffset(FI));
5330 }
5331
5332 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
5333 /// the parent's frame pointer.
5334 unsigned AArch64FrameLowering::getWinEHParentFrameOffset(
5335 const MachineFunction &MF) const {
5336 return 0;
5337 }
5338
5339 /// Funclets only need to account for space for the callee saved registers,
5340 /// as the locals are accounted for in the parent's stack frame.
5341 unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
5342 const MachineFunction &MF) const {
5343 // This is the size of the pushed CSRs.
5344 unsigned CSSize =
5345 MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
5346 // This is the amount of stack a funclet needs to allocate.
5347 return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
5348 getStackAlign());
5349 }
5350
5351 namespace {
5352 struct FrameObject {
5353 bool IsValid = false;
5354 // Index of the object in MFI.
5355 int ObjectIndex = 0;
5356 // Group ID this object belongs to.
5357 int GroupIndex = -1;
5358 // This object should be placed first (closest to SP).
5359 bool ObjectFirst = false;
5360 // This object's group (which always contains the object with
5361 // ObjectFirst==true) should be placed first.
5362 bool GroupFirst = false;
5363
5364 // Used to distinguish between FP and GPR accesses. The values are decided so
5365 // that they sort FPR < Hazard < GPR and they can be or'd together.
5366 unsigned Accesses = 0;
5367 enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 };
5368 };
5369
5370 class GroupBuilder {
5371 SmallVector<int, 8> CurrentMembers;
5372 int NextGroupIndex = 0;
5373 std::vector<FrameObject> &Objects;
5374
5375 public:
5376   GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
5377   void AddMember(int Index) { CurrentMembers.push_back(Index); }
5378   void EndCurrentGroup() {
5379 if (CurrentMembers.size() > 1) {
5380 // Create a new group with the current member list. This might remove them
5381 // from their pre-existing groups. That's OK, dealing with overlapping
5382 // groups is too hard and unlikely to make a difference.
5383 LLVM_DEBUG(dbgs() << "group:");
5384 for (int Index : CurrentMembers) {
5385 Objects[Index].GroupIndex = NextGroupIndex;
5386 LLVM_DEBUG(dbgs() << " " << Index);
5387 }
5388 LLVM_DEBUG(dbgs() << "\n");
5389 NextGroupIndex++;
5390 }
5391 CurrentMembers.clear();
5392 }
5393 };
5394
5395 bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
5396 // Objects at a lower index are closer to FP; objects at a higher index are
5397 // closer to SP.
5398 //
5399 // For consistency in our comparison, all invalid objects are placed
5400 // at the end. This also allows us to stop walking when we hit the
5401 // first invalid item after it's all sorted.
5402 //
5403 // If we want to include a stack hazard region, order FPR accesses < the
5404 // hazard object < GPRs accesses in order to create a separation between the
5405 // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR.
5406 //
5407 // Otherwise the "first" object goes first (closest to SP), followed by the
5408 // members of the "first" group.
5409 //
5410 // The rest are sorted by the group index to keep the groups together.
5411 // Higher numbered groups are more likely to be around longer (i.e. untagged
5412 // in the function epilogue and not at some earlier point). Place them closer
5413 // to SP.
5414 //
5415 // If all else equal, sort by the object index to keep the objects in the
5416 // original order.
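//
// Worked example (illustrative): under hazard ordering, three objects
// with Accesses of GPR (4), Hazard (2) and FPR (1) sort as
// FPR < Hazard < GPR, separating FPR accesses from GPR accesses.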
5417 return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst,
5418 A.GroupIndex, A.ObjectIndex) <
5419 std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst,
5420 B.GroupIndex, B.ObjectIndex);
5421 }
5422 } // namespace
5423
5424 void AArch64FrameLowering::orderFrameObjects(
5425 const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
5426 if (!OrderFrameObjects || ObjectsToAllocate.empty())
5427 return;
5428
5429 const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
5430 const MachineFrameInfo &MFI = MF.getFrameInfo();
5431 std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
5432 for (auto &Obj : ObjectsToAllocate) {
5433 FrameObjects[Obj].IsValid = true;
5434 FrameObjects[Obj].ObjectIndex = Obj;
5435 }
5436
5437 // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
5438 // the same time.
5439 GroupBuilder GB(FrameObjects);
5440 for (auto &MBB : MF) {
5441 for (auto &MI : MBB) {
5442 if (MI.isDebugInstr())
5443 continue;
5444
5445 if (AFI.hasStackHazardSlotIndex()) {
5446 std::optional<int> FI = getLdStFrameID(MI, MFI);
5447 if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
5448 if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
5449 AArch64InstrInfo::isFpOrNEON(MI))
5450 FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
5451 else
5452 FrameObjects[*FI].Accesses |= FrameObject::AccessGPR;
5453 }
5454 }
5455
5456 int OpIndex;
5457 switch (MI.getOpcode()) {
5458 case AArch64::STGloop:
5459 case AArch64::STZGloop:
5460 OpIndex = 3;
5461 break;
5462 case AArch64::STGi:
5463 case AArch64::STZGi:
5464 case AArch64::ST2Gi:
5465 case AArch64::STZ2Gi:
5466 OpIndex = 1;
5467 break;
5468 default:
5469 OpIndex = -1;
5470 }
5471
5472 int TaggedFI = -1;
5473 if (OpIndex >= 0) {
5474 const MachineOperand &MO = MI.getOperand(OpIndex);
5475 if (MO.isFI()) {
5476 int FI = MO.getIndex();
5477 if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
5478 FrameObjects[FI].IsValid)
5479 TaggedFI = FI;
5480 }
5481 }
5482
5483 // If this is a stack tagging instruction for a slot that is not part of a
5484 // group yet, either start a new group or add it to the current one.
5485 if (TaggedFI >= 0)
5486 GB.AddMember(TaggedFI);
5487 else
5488 GB.EndCurrentGroup();
5489 }
5490 // Groups should never span multiple basic blocks.
5491 GB.EndCurrentGroup();
5492 }
5493
5494 if (AFI.hasStackHazardSlotIndex()) {
5495 FrameObjects[AFI.getStackHazardSlotIndex()].Accesses =
5496 FrameObject::AccessHazard;
5497 // If a stack object is unknown or both GPR and FPR, sort it into GPR.
5498 for (auto &Obj : FrameObjects)
5499 if (!Obj.Accesses ||
5500 Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR))
5501 Obj.Accesses = FrameObject::AccessGPR;
5502 }
5503
5504 // If the function's tagged base pointer is pinned to a stack slot, we want to
5505 // put that slot first when possible. This will likely place it at SP + 0,
5506 // and save one instruction when generating the base pointer because IRG does
5507 // not allow an immediate offset.
5508 std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
5509 if (TBPI) {
5510 FrameObjects[*TBPI].ObjectFirst = true;
5511 FrameObjects[*TBPI].GroupFirst = true;
5512 int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
5513 if (FirstGroupIndex >= 0)
5514 for (FrameObject &Object : FrameObjects)
5515 if (Object.GroupIndex == FirstGroupIndex)
5516 Object.GroupFirst = true;
5517 }

  llvm::stable_sort(FrameObjects, FrameObjectCompare);

  int i = 0;
  for (auto &Obj : FrameObjects) {
    // All invalid items are sorted at the end, so it's safe to stop.
    if (!Obj.IsValid)
      break;
    ObjectsToAllocate[i++] = Obj.ObjectIndex;
  }

  LLVM_DEBUG({
    dbgs() << "Final frame order:\n";
    for (auto &Obj : FrameObjects) {
      if (!Obj.IsValid)
        break;
      dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
      if (Obj.ObjectFirst)
        dbgs() << ", first";
      if (Obj.GroupFirst)
        dbgs() << ", group-first";
      dbgs() << "\n";
    }
  });
}

/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
/// least every ProbeSize bytes. Returns an iterator of the first instruction
/// after the loop. The difference between SP and TargetReg must be an exact
/// multiple of ProbeSize.
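///
/// The emitted code has the following shape (a sketch of the instructions
/// built below, assuming ProbeSize is encodable in a SUB immediate):
///   LoopMBB:
///     sub  sp, sp, #ProbeSize
///     str  xzr, [sp]
///     cmp  sp, TargetReg
///     b.ne LoopMBB
///   ExitMBB: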
MachineBasicBlock::iterator
AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
    MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
    Register TargetReg) const {
  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);

  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
  // in SUB).
  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
                  StackOffset::getFixed(-ProbeSize), TII,
                  MachineInstr::FrameSetup);
  // STR XZR, [SP]
  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
      .addReg(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(MachineInstr::FrameSetup);
  // CMP SP, TargetReg
  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
      .setMIFlags(MachineInstr::FrameSetup);
  // B.NE LoopMBB
  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::NE)
      .addMBB(LoopMBB)
      .setMIFlags(MachineInstr::FrameSetup);

  LoopMBB->addSuccessor(ExitMBB);
  LoopMBB->addSuccessor(LoopMBB);
  // Synthesize the exit MBB.
  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
  MBB.addSuccessor(LoopMBB);
  // Update liveins.
  fullyRecomputeLiveIns({ExitMBB, LoopMBB});

  return ExitMBB->begin();
}

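/// Allocate and probe a fixed-size stack frame of \p FrameSize bytes. The
/// allocation is performed in blocks of the function's stack probe size, each
/// block followed by a probing store of XZR at the new SP. Small block counts
/// are emitted unrolled; larger counts use inlineStackProbeLoopExactMultiple
/// with \p ScratchReg as the loop's target register. Any residual bytes are
/// allocated last and probed only if they exceed the largest allowed
/// unprobed gap.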
void AArch64FrameLowering::inlineStackProbeFixed(
    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
    StackOffset CFAOffset) const {
  MachineBasicBlock *MBB = MBBI->getParent();
  MachineFunction &MF = *MBB->getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
  bool HasFP = hasFP(MF);

  DebugLoc DL;
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  int64_t NumBlocks = FrameSize / ProbeSize;
  int64_t ResidualSize = FrameSize % ProbeSize;

  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
                    << NumBlocks << " blocks of " << ProbeSize
                    << " bytes, plus " << ResidualSize << " bytes\n");
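
  // For example (hypothetical numbers): with FrameSize = 10000 and a
  // ProbeSize of 4096, this gives NumBlocks = 2 and ResidualSize = 1808.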

  // Decrement SP by NumBlocks * ProbeSize bytes, with either an unrolled
  // sequence or an ordinary loop.
  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
    for (int i = 0; i < NumBlocks; ++i) {
      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
      // encodable in a SUB).
      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                      StackOffset::getFixed(-ProbeSize), TII,
                      MachineInstr::FrameSetup, false, false, nullptr,
                      EmitAsyncCFI && !HasFP, CFAOffset);
      CFAOffset += StackOffset::getFixed(ProbeSize);
      // STR XZR, [SP]
      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
          .addReg(AArch64::XZR)
          .addReg(AArch64::SP)
          .addImm(0)
          .setMIFlags(MachineInstr::FrameSetup);
    }
  } else if (NumBlocks != 0) {
    // SUB ScratchReg, SP, #(NumBlocks * ProbeSize) (or equivalent if the
    // offset is not encodable in SUB). ScratchReg may temporarily become the
    // CFA register.
    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
                    MachineInstr::FrameSetup, false, false, nullptr,
                    EmitAsyncCFI && !HasFP, CFAOffset);
    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
    MBB = MBBI->getParent();
    if (EmitAsyncCFI && !HasFP) {
      // Set the CFA register back to SP.
      CFIInstBuilder(*MBB, MBBI, MachineInstr::FrameSetup)
          .buildDefCFARegister(AArch64::SP);
    }
  }

  if (ResidualSize != 0) {
    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not
    // encodable in SUB).
    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                    StackOffset::getFixed(-ResidualSize), TII,
                    MachineInstr::FrameSetup, false, false, nullptr,
                    EmitAsyncCFI && !HasFP, CFAOffset);
    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
      // STR XZR, [SP]
      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
          .addReg(AArch64::XZR)
          .addReg(AArch64::SP)
          .addImm(0)
          .setMIFlags(MachineInstr::FrameSetup);
    }
  }
}

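/// Replace the PROBED_STACKALLOC and PROBED_STACKALLOC_VAR pseudo-instructions
/// in \p MBB with real stack probing sequences: fixed-size allocations are
/// expanded by inlineStackProbeFixed, variable-size ones by
/// AArch64InstrInfo::probedStackAlloc.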
void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
                                            MachineBasicBlock &MBB) const {
  // Get the instructions that need to be replaced. We emit at most two of
  // these. Remember them in order to avoid complications coming from the need
  // to traverse the block while potentially creating more blocks.
  SmallVector<MachineInstr *, 4> ToReplace;
  for (MachineInstr &MI : MBB)
    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
      ToReplace.push_back(&MI);

  for (MachineInstr *MI : ToReplace) {
    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
      Register ScratchReg = MI->getOperand(0).getReg();
      int64_t FrameSize = MI->getOperand(1).getImm();
      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
                                               MI->getOperand(3).getImm());
      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
                            CFAOffset);
    } else {
      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
             "Stack probe pseudo-instruction expected");
      const AArch64InstrInfo *TII =
          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
      Register TargetReg = MI->getOperand(0).getReg();
      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
    }
    MI->eraseFromParent();
  }
}

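/// Describes a stack object's accesses for the purpose of SME stack hazard
/// remarks: which register kinds (GPR, PPR, FPR) load from or store to it,
/// plus its position and size relative to SP.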
struct StackAccess {
  enum AccessType {
    NotAccessed = 0, // Stack object not accessed by load/store instructions.
    GPR = 1 << 0,    // A general purpose register.
    PPR = 1 << 1,    // A predicate register.
    FPR = 1 << 2,    // A floating point/Neon/SVE register.
  };

  int Idx;
  StackOffset Offset;
  int64_t Size;
  unsigned AccessTypes;

  StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {}

  bool operator<(const StackAccess &Rhs) const {
    return std::make_tuple(start(), Idx) <
           std::make_tuple(Rhs.start(), Rhs.Idx);
  }

  bool isCPU() const {
    // Predicate register load and store instructions execute on the CPU.
    return AccessTypes & (AccessType::GPR | AccessType::PPR);
  }
  bool isSME() const { return AccessTypes & AccessType::FPR; }
  bool isMixed() const { return isCPU() && isSME(); }

  // Byte offsets of the object relative to SP, counting each scalable byte
  // once (i.e. as if vscale were 1).
  int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
  int64_t end() const { return start() + Size; }

  std::string getTypeString() const {
    switch (AccessTypes) {
    case AccessType::FPR:
      return "FPR";
    case AccessType::PPR:
      return "PPR";
    case AccessType::GPR:
      return "GPR";
    case AccessType::NotAccessed:
      return "NA";
    default:
      return "Mixed";
    }
  }

  void print(raw_ostream &OS) const {
    OS << getTypeString() << " stack object at [SP"
       << (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed();
    if (Offset.getScalable())
      OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable()
         << " * vscale";
    OS << "]";
  }
};

static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
  SA.print(OS);
  return OS;
}

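/// Emit optimization remarks flagging potential SME stack hazards: pairs of
/// CPU-accessed (GPR/PPR) and SME-accessed (FPR) stack objects that lie
/// within the hazard distance of each other, as well as objects accessed by
/// both kinds of instruction.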
void AArch64FrameLowering::emitRemarks(
    const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {

  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
  if (AFI->getSMEFnAttrs().hasNonStreamingInterfaceAndBody())
    return;

  unsigned StackHazardSize = getStackHazardSize(MF);
  const uint64_t HazardSize =
      StackHazardSize ? StackHazardSize : StackHazardRemarkSize;

  if (HazardSize == 0)
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  // Bail if function has no stack objects.
  if (!MFI.hasStackObjects())
    return;

  std::vector<StackAccess> StackAccesses(MFI.getNumObjects());

  size_t NumFPLdSt = 0;
  size_t NumNonFPLdSt = 0;

  // Collect stack accesses via Load/Store instructions.
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
        continue;
      for (MachineMemOperand *MMO : MI.memoperands()) {
        std::optional<int> FI = getMMOFrameID(MMO, MFI);
        if (FI && !MFI.isDeadObjectIndex(*FI)) {
          int FrameIdx = *FI;

          size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
          if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) {
            StackAccesses[ArrIdx].Idx = FrameIdx;
            StackAccesses[ArrIdx].Offset =
                getFrameIndexReferenceFromSP(MF, FrameIdx);
            StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx);
          }

          unsigned RegTy = StackAccess::AccessType::GPR;
          if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
            // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
            // spill/fill the predicate as a data vector (so count as FPR
            // accesses).
            if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
                MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
                AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
              RegTy = StackAccess::PPR;
            } else
              RegTy = StackAccess::FPR;
          } else if (AArch64InstrInfo::isFpOrNEON(MI)) {
            RegTy = StackAccess::FPR;
          }

          StackAccesses[ArrIdx].AccessTypes |= RegTy;

          if (RegTy == StackAccess::FPR)
            ++NumFPLdSt;
          else
            ++NumNonFPLdSt;
        }
      }
    }
  }

  if (NumFPLdSt == 0 || NumNonFPLdSt == 0)
    return;

  llvm::sort(StackAccesses);
  llvm::erase_if(StackAccesses, [](const StackAccess &S) {
    return S.AccessTypes == StackAccess::NotAccessed;
  });

  SmallVector<const StackAccess *> MixedObjects;
  SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs;

  if (StackAccesses.front().isMixed())
    MixedObjects.push_back(&StackAccesses.front());

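  // Walk the objects in address order and compare each adjacent pair: a
  // CPU-accessed (GPR/PPR) object within HazardSize bytes of an SME-accessed
  // (FPR) one, in either order, is recorded as a hazard pair.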
  for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end());
       It != End; ++It) {
    const auto &First = *It;
    const auto &Second = *(It + 1);

    if (Second.isMixed())
      MixedObjects.push_back(&Second);

    if ((First.isSME() && Second.isCPU()) ||
        (First.isCPU() && Second.isSME())) {
      uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
      if (Distance < HazardSize)
        HazardPairs.emplace_back(&First, &Second);
    }
  }

  auto EmitRemark = [&](llvm::StringRef Str) {
    ORE->emit([&]() {
      auto R = MachineOptimizationRemarkAnalysis(
          "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
      return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str;
    });
  };

  for (const auto &P : HazardPairs)
    EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str());

  for (const auto *Obj : MixedObjects)
    EmitRemark(
        formatv("{0} accessed by both GP and FP instructions", *Obj).str());
}