1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
11 //
12 // The pass runs after the PrologEpilogInserter where we emit the CFI
13 // instructions. In order to preserve the correctness of the unwind information,
14 // the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag or, alternatively, apply an ad-hoc fix
16 // to unwind information.
17 //
18 //===----------------------------------------------------------------------===//
19
20 #include "AArch64InstrInfo.h"
21 #include "AArch64MachineFunctionInfo.h"
22 #include "AArch64Subtarget.h"
23 #include "MCTargetDesc/AArch64AddressingModes.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/ADT/StringRef.h"
27 #include "llvm/ADT/iterator_range.h"
28 #include "llvm/Analysis/AliasAnalysis.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/CodeGen/MachineFunctionPass.h"
32 #include "llvm/CodeGen/MachineInstr.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineOperand.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/CodeGen/TargetRegisterInfo.h"
37 #include "llvm/IR/DebugLoc.h"
38 #include "llvm/MC/MCAsmInfo.h"
39 #include "llvm/MC/MCDwarf.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/DebugCounter.h"
44 #include "llvm/Support/ErrorHandling.h"
45 #include "llvm/Support/raw_ostream.h"
46 #include <cassert>
47 #include <cstdint>
48 #include <functional>
49 #include <iterator>
50 #include <limits>
51 #include <optional>
52
53 using namespace llvm;
54
// Identifier string for this pass; it is also used below as the prefix of the
// register-renaming debug counter's name.
#define DEBUG_TYPE "aarch64-ldst-opt"

// Statistic counters, one per kind of transformation. Each pairs a counter
// variable with its human-readable description.
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
                                   "not passed the alignment check");
STATISTIC(NumConstOffsetFolded,
          "Number of const offset of index address folded");

// Debug counter that selects which candidate pairs are considered for
// register renaming.
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
// Hidden option; defaults to 20.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we form
// pre-/post-index instructions.
// Hidden option; defaults to 100.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
// Hidden option; defaults to 10.
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
                                        cl::init(10), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
// Hidden option; enabled by default.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

// Human-readable pass name, shared by pass registration and getPassName().
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
91
92 namespace {
93
94 using LdStPairFlags = struct LdStPairFlags {
95 // If a matching instruction is found, MergeForward is set to true if the
96 // merge is to remove the first instruction and replace the second with
97 // a pair-wise insn, and false if the reverse is true.
98 bool MergeForward = false;
99
100 // SExtIdx gives the index of the result of the load pair that must be
101 // extended. The value of SExtIdx assumes that the paired load produces the
102 // value in this order: (I, returned iterator), i.e., -1 means no value has
103 // to be extended, 0 means I, and 1 means the returned iterator.
104 int SExtIdx = -1;
105
106 // If not none, RenameReg can be used to rename the result register of the
107 // first store in a pair. Currently this only works when merging stores
108 // forward.
109 std::optional<MCPhysReg> RenameReg;
110
111 LdStPairFlags() = default;
112
113 void setMergeForward(bool V = true) { MergeForward = V; }
114 bool getMergeForward() const { return MergeForward; }
115
116 void setSExtIdx(int V) { SExtIdx = V; }
117 int getSExtIdx() const { return SExtIdx; }
118
119 void setRenameReg(MCPhysReg R) { RenameReg = R; }
120 void clearRenameReg() { RenameReg = std::nullopt; }
121 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
122 };
123
// Machine-function pass hosting the AArch64 load/store peephole
// optimizations declared below: pairing of loads/stores, merging of narrow
// zero stores, promotion of loads that read from a preceding store, and
// folding of base-register updates and constant index offsets into the
// memory instruction.
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  // Pass identifier, passed to the MachineFunctionPass base-class constructor.
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  // Alias analysis and target information used by the member functions.
  // NOTE(review): presumably assigned in runOnMachineFunction, whose body is
  // not visible in this part of the file -- confirm.
  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  // NOTE(review): judging by the name, the register units defined in the
  // block being processed; how it is maintained is not visible here.
  LiveRegUnits DefinedInBB;

  // Declares the analyses this pass requires: the alias-analysis wrapper
  // pass, in addition to whatever the MachineFunctionPass base class requests.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero-store instructions indicated into a single
  // wider store instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  // `MergeEither` is set to true if the combined instruction may be placed
  // either at the location of the load/store instruction or at the location of
  // the update instruction.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                 bool &MergeEither);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // NOTE(review): by its name and signature, checks whether MI materialises a
  // constant into IndexReg that could be folded into MemMI, reporting the
  // value through Offset -- the definition is not visible here; confirm.
  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  std::optional<MachineBasicBlock::iterator>
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsForward,
                  bool IsPreIdx, bool MergeEither);

  // NOTE(review): counterpart of mergeUpdateInsn for the constant-offset
  // (index address) case -- the definition is not visible here; confirm.
  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge a base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  // Per-block driver; EnableNarrowZeroStOpt gates the narrow zero-store
  // merging. NOTE(review): the return value is presumably "block was
  // changed" -- the definition is not visible here; confirm.
  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  // Pass entry point for one machine function.
  bool runOnMachineFunction(MachineFunction &Fn) override;

  // The pass requires the NoVRegs property on its input.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setNoVRegs();
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

// Out-of-line definition of the pass identifier.
char AArch64LoadStoreOpt::ID = 0;
243
244 } // end anonymous namespace
245
// Registers the pass under the name "aarch64-ldst-opt" with the description
// AARCH64_LOAD_STORE_OPT_NAME.
// NOTE(review): the two trailing `false` arguments are presumably the
// CFG-only and is-analysis flags of INITIALIZE_PASS -- confirm.
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)
248
isNarrowStore(unsigned Opc)249 static bool isNarrowStore(unsigned Opc) {
250 switch (Opc) {
251 default:
252 return false;
253 case AArch64::STRBBui:
254 case AArch64::STURBBi:
255 case AArch64::STRHHui:
256 case AArch64::STURHHi:
257 return true;
258 }
259 }
260
261 // These instruction set memory tag and either keep memory contents unchanged or
262 // set it to zero, ignoring the address part of the source register.
isTagStore(const MachineInstr & MI)263 static bool isTagStore(const MachineInstr &MI) {
264 switch (MI.getOpcode()) {
265 default:
266 return false;
267 case AArch64::STGi:
268 case AArch64::STZGi:
269 case AArch64::ST2Gi:
270 case AArch64::STZ2Gi:
271 return true;
272 }
273 }
274
getMatchingNonSExtOpcode(unsigned Opc,bool * IsValidLdStrOpc=nullptr)275 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
276 bool *IsValidLdStrOpc = nullptr) {
277 if (IsValidLdStrOpc)
278 *IsValidLdStrOpc = true;
279 switch (Opc) {
280 default:
281 if (IsValidLdStrOpc)
282 *IsValidLdStrOpc = false;
283 return std::numeric_limits<unsigned>::max();
284 case AArch64::STRDui:
285 case AArch64::STURDi:
286 case AArch64::STRDpre:
287 case AArch64::STRQui:
288 case AArch64::STURQi:
289 case AArch64::STRQpre:
290 case AArch64::STRBBui:
291 case AArch64::STURBBi:
292 case AArch64::STRHHui:
293 case AArch64::STURHHi:
294 case AArch64::STRWui:
295 case AArch64::STRWpre:
296 case AArch64::STURWi:
297 case AArch64::STRXui:
298 case AArch64::STRXpre:
299 case AArch64::STURXi:
300 case AArch64::STR_ZXI:
301 case AArch64::LDRDui:
302 case AArch64::LDURDi:
303 case AArch64::LDRDpre:
304 case AArch64::LDRQui:
305 case AArch64::LDURQi:
306 case AArch64::LDRQpre:
307 case AArch64::LDRWui:
308 case AArch64::LDURWi:
309 case AArch64::LDRWpre:
310 case AArch64::LDRXui:
311 case AArch64::LDURXi:
312 case AArch64::LDRXpre:
313 case AArch64::STRSui:
314 case AArch64::STURSi:
315 case AArch64::STRSpre:
316 case AArch64::LDRSui:
317 case AArch64::LDURSi:
318 case AArch64::LDRSpre:
319 case AArch64::LDR_ZXI:
320 return Opc;
321 case AArch64::LDRSWui:
322 return AArch64::LDRWui;
323 case AArch64::LDURSWi:
324 return AArch64::LDURWi;
325 case AArch64::LDRSWpre:
326 return AArch64::LDRWpre;
327 }
328 }
329
getMatchingWideOpcode(unsigned Opc)330 static unsigned getMatchingWideOpcode(unsigned Opc) {
331 switch (Opc) {
332 default:
333 llvm_unreachable("Opcode has no wide equivalent!");
334 case AArch64::STRBBui:
335 return AArch64::STRHHui;
336 case AArch64::STRHHui:
337 return AArch64::STRWui;
338 case AArch64::STURBBi:
339 return AArch64::STURHHi;
340 case AArch64::STURHHi:
341 return AArch64::STURWi;
342 case AArch64::STURWi:
343 return AArch64::STURXi;
344 case AArch64::STRWui:
345 return AArch64::STRXui;
346 }
347 }
348
getMatchingPairOpcode(unsigned Opc)349 static unsigned getMatchingPairOpcode(unsigned Opc) {
350 switch (Opc) {
351 default:
352 llvm_unreachable("Opcode has no pairwise equivalent!");
353 case AArch64::STRSui:
354 case AArch64::STURSi:
355 return AArch64::STPSi;
356 case AArch64::STRSpre:
357 return AArch64::STPSpre;
358 case AArch64::STRDui:
359 case AArch64::STURDi:
360 return AArch64::STPDi;
361 case AArch64::STRDpre:
362 return AArch64::STPDpre;
363 case AArch64::STRQui:
364 case AArch64::STURQi:
365 case AArch64::STR_ZXI:
366 return AArch64::STPQi;
367 case AArch64::STRQpre:
368 return AArch64::STPQpre;
369 case AArch64::STRWui:
370 case AArch64::STURWi:
371 return AArch64::STPWi;
372 case AArch64::STRWpre:
373 return AArch64::STPWpre;
374 case AArch64::STRXui:
375 case AArch64::STURXi:
376 return AArch64::STPXi;
377 case AArch64::STRXpre:
378 return AArch64::STPXpre;
379 case AArch64::LDRSui:
380 case AArch64::LDURSi:
381 return AArch64::LDPSi;
382 case AArch64::LDRSpre:
383 return AArch64::LDPSpre;
384 case AArch64::LDRDui:
385 case AArch64::LDURDi:
386 return AArch64::LDPDi;
387 case AArch64::LDRDpre:
388 return AArch64::LDPDpre;
389 case AArch64::LDRQui:
390 case AArch64::LDURQi:
391 case AArch64::LDR_ZXI:
392 return AArch64::LDPQi;
393 case AArch64::LDRQpre:
394 return AArch64::LDPQpre;
395 case AArch64::LDRWui:
396 case AArch64::LDURWi:
397 return AArch64::LDPWi;
398 case AArch64::LDRWpre:
399 return AArch64::LDPWpre;
400 case AArch64::LDRXui:
401 case AArch64::LDURXi:
402 return AArch64::LDPXi;
403 case AArch64::LDRXpre:
404 return AArch64::LDPXpre;
405 case AArch64::LDRSWui:
406 case AArch64::LDURSWi:
407 return AArch64::LDPSWi;
408 case AArch64::LDRSWpre:
409 return AArch64::LDPSWpre;
410 }
411 }
412
isMatchingStore(MachineInstr & LoadInst,MachineInstr & StoreInst)413 static unsigned isMatchingStore(MachineInstr &LoadInst,
414 MachineInstr &StoreInst) {
415 unsigned LdOpc = LoadInst.getOpcode();
416 unsigned StOpc = StoreInst.getOpcode();
417 switch (LdOpc) {
418 default:
419 llvm_unreachable("Unsupported load instruction!");
420 case AArch64::LDRBBui:
421 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
422 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
423 case AArch64::LDURBBi:
424 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
425 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
426 case AArch64::LDRHHui:
427 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
428 StOpc == AArch64::STRXui;
429 case AArch64::LDURHHi:
430 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
431 StOpc == AArch64::STURXi;
432 case AArch64::LDRWui:
433 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
434 case AArch64::LDURWi:
435 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
436 case AArch64::LDRXui:
437 return StOpc == AArch64::STRXui;
438 case AArch64::LDURXi:
439 return StOpc == AArch64::STURXi;
440 }
441 }
442
getPreIndexedOpcode(unsigned Opc)443 static unsigned getPreIndexedOpcode(unsigned Opc) {
444 // FIXME: We don't currently support creating pre-indexed loads/stores when
445 // the load or store is the unscaled version. If we decide to perform such an
446 // optimization in the future the cases for the unscaled loads/stores will
447 // need to be added here.
448 switch (Opc) {
449 default:
450 llvm_unreachable("Opcode has no pre-indexed equivalent!");
451 case AArch64::STRSui:
452 return AArch64::STRSpre;
453 case AArch64::STRDui:
454 return AArch64::STRDpre;
455 case AArch64::STRQui:
456 return AArch64::STRQpre;
457 case AArch64::STRBBui:
458 return AArch64::STRBBpre;
459 case AArch64::STRHHui:
460 return AArch64::STRHHpre;
461 case AArch64::STRWui:
462 return AArch64::STRWpre;
463 case AArch64::STRXui:
464 return AArch64::STRXpre;
465 case AArch64::LDRSui:
466 return AArch64::LDRSpre;
467 case AArch64::LDRDui:
468 return AArch64::LDRDpre;
469 case AArch64::LDRQui:
470 return AArch64::LDRQpre;
471 case AArch64::LDRBBui:
472 return AArch64::LDRBBpre;
473 case AArch64::LDRHHui:
474 return AArch64::LDRHHpre;
475 case AArch64::LDRWui:
476 return AArch64::LDRWpre;
477 case AArch64::LDRXui:
478 return AArch64::LDRXpre;
479 case AArch64::LDRSWui:
480 return AArch64::LDRSWpre;
481 case AArch64::LDPSi:
482 return AArch64::LDPSpre;
483 case AArch64::LDPSWi:
484 return AArch64::LDPSWpre;
485 case AArch64::LDPDi:
486 return AArch64::LDPDpre;
487 case AArch64::LDPQi:
488 return AArch64::LDPQpre;
489 case AArch64::LDPWi:
490 return AArch64::LDPWpre;
491 case AArch64::LDPXi:
492 return AArch64::LDPXpre;
493 case AArch64::STPSi:
494 return AArch64::STPSpre;
495 case AArch64::STPDi:
496 return AArch64::STPDpre;
497 case AArch64::STPQi:
498 return AArch64::STPQpre;
499 case AArch64::STPWi:
500 return AArch64::STPWpre;
501 case AArch64::STPXi:
502 return AArch64::STPXpre;
503 case AArch64::STGi:
504 return AArch64::STGPreIndex;
505 case AArch64::STZGi:
506 return AArch64::STZGPreIndex;
507 case AArch64::ST2Gi:
508 return AArch64::ST2GPreIndex;
509 case AArch64::STZ2Gi:
510 return AArch64::STZ2GPreIndex;
511 case AArch64::STGPi:
512 return AArch64::STGPpre;
513 }
514 }
515
getBaseAddressOpcode(unsigned Opc)516 static unsigned getBaseAddressOpcode(unsigned Opc) {
517 // TODO: Add more index address stores.
518 switch (Opc) {
519 default:
520 llvm_unreachable("Opcode has no base address equivalent!");
521 case AArch64::LDRBroX:
522 return AArch64::LDRBui;
523 case AArch64::LDRBBroX:
524 return AArch64::LDRBBui;
525 case AArch64::LDRSBXroX:
526 return AArch64::LDRSBXui;
527 case AArch64::LDRSBWroX:
528 return AArch64::LDRSBWui;
529 case AArch64::LDRHroX:
530 return AArch64::LDRHui;
531 case AArch64::LDRHHroX:
532 return AArch64::LDRHHui;
533 case AArch64::LDRSHXroX:
534 return AArch64::LDRSHXui;
535 case AArch64::LDRSHWroX:
536 return AArch64::LDRSHWui;
537 case AArch64::LDRWroX:
538 return AArch64::LDRWui;
539 case AArch64::LDRSroX:
540 return AArch64::LDRSui;
541 case AArch64::LDRSWroX:
542 return AArch64::LDRSWui;
543 case AArch64::LDRDroX:
544 return AArch64::LDRDui;
545 case AArch64::LDRXroX:
546 return AArch64::LDRXui;
547 case AArch64::LDRQroX:
548 return AArch64::LDRQui;
549 }
550 }
551
getPostIndexedOpcode(unsigned Opc)552 static unsigned getPostIndexedOpcode(unsigned Opc) {
553 switch (Opc) {
554 default:
555 llvm_unreachable("Opcode has no post-indexed wise equivalent!");
556 case AArch64::STRSui:
557 case AArch64::STURSi:
558 return AArch64::STRSpost;
559 case AArch64::STRDui:
560 case AArch64::STURDi:
561 return AArch64::STRDpost;
562 case AArch64::STRQui:
563 case AArch64::STURQi:
564 return AArch64::STRQpost;
565 case AArch64::STRBBui:
566 return AArch64::STRBBpost;
567 case AArch64::STRHHui:
568 return AArch64::STRHHpost;
569 case AArch64::STRWui:
570 case AArch64::STURWi:
571 return AArch64::STRWpost;
572 case AArch64::STRXui:
573 case AArch64::STURXi:
574 return AArch64::STRXpost;
575 case AArch64::LDRSui:
576 case AArch64::LDURSi:
577 return AArch64::LDRSpost;
578 case AArch64::LDRDui:
579 case AArch64::LDURDi:
580 return AArch64::LDRDpost;
581 case AArch64::LDRQui:
582 case AArch64::LDURQi:
583 return AArch64::LDRQpost;
584 case AArch64::LDRBBui:
585 return AArch64::LDRBBpost;
586 case AArch64::LDRHHui:
587 return AArch64::LDRHHpost;
588 case AArch64::LDRWui:
589 case AArch64::LDURWi:
590 return AArch64::LDRWpost;
591 case AArch64::LDRXui:
592 case AArch64::LDURXi:
593 return AArch64::LDRXpost;
594 case AArch64::LDRSWui:
595 return AArch64::LDRSWpost;
596 case AArch64::LDPSi:
597 return AArch64::LDPSpost;
598 case AArch64::LDPSWi:
599 return AArch64::LDPSWpost;
600 case AArch64::LDPDi:
601 return AArch64::LDPDpost;
602 case AArch64::LDPQi:
603 return AArch64::LDPQpost;
604 case AArch64::LDPWi:
605 return AArch64::LDPWpost;
606 case AArch64::LDPXi:
607 return AArch64::LDPXpost;
608 case AArch64::STPSi:
609 return AArch64::STPSpost;
610 case AArch64::STPDi:
611 return AArch64::STPDpost;
612 case AArch64::STPQi:
613 return AArch64::STPQpost;
614 case AArch64::STPWi:
615 return AArch64::STPWpost;
616 case AArch64::STPXi:
617 return AArch64::STPXpost;
618 case AArch64::STGi:
619 return AArch64::STGPostIndex;
620 case AArch64::STZGi:
621 return AArch64::STZGPostIndex;
622 case AArch64::ST2Gi:
623 return AArch64::ST2GPostIndex;
624 case AArch64::STZ2Gi:
625 return AArch64::STZ2GPostIndex;
626 case AArch64::STGPi:
627 return AArch64::STGPpost;
628 }
629 }
630
isPreLdStPairCandidate(MachineInstr & FirstMI,MachineInstr & MI)631 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
632
633 unsigned OpcA = FirstMI.getOpcode();
634 unsigned OpcB = MI.getOpcode();
635
636 switch (OpcA) {
637 default:
638 return false;
639 case AArch64::STRSpre:
640 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
641 case AArch64::STRDpre:
642 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
643 case AArch64::STRQpre:
644 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
645 case AArch64::STRWpre:
646 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
647 case AArch64::STRXpre:
648 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
649 case AArch64::LDRSpre:
650 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
651 case AArch64::LDRDpre:
652 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
653 case AArch64::LDRQpre:
654 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
655 case AArch64::LDRWpre:
656 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
657 case AArch64::LDRXpre:
658 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
659 case AArch64::LDRSWpre:
660 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
661 }
662 }
663
664 // Returns the scale and offset range of pre/post indexed variants of MI.
getPrePostIndexedMemOpInfo(const MachineInstr & MI,int & Scale,int & MinOffset,int & MaxOffset)665 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
666 int &MinOffset, int &MaxOffset) {
667 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
668 bool IsTagStore = isTagStore(MI);
669 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
670 // as in the "unsigned offset" variant.
671 // All other pre/post indexed ldst instructions are unscaled.
672 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
673
674 if (IsPaired) {
675 MinOffset = -64;
676 MaxOffset = 63;
677 } else {
678 MinOffset = -256;
679 MaxOffset = 255;
680 }
681 }
682
getLdStRegOp(MachineInstr & MI,unsigned PairedRegOp=0)683 static MachineOperand &getLdStRegOp(MachineInstr &MI,
684 unsigned PairedRegOp = 0) {
685 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
686 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
687 if (IsPreLdSt)
688 PairedRegOp += 1;
689 unsigned Idx =
690 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
691 return MI.getOperand(Idx);
692 }
693
isLdOffsetInRangeOfSt(MachineInstr & LoadInst,MachineInstr & StoreInst,const AArch64InstrInfo * TII)694 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
695 MachineInstr &StoreInst,
696 const AArch64InstrInfo *TII) {
697 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
698 int LoadSize = TII->getMemScale(LoadInst);
699 int StoreSize = TII->getMemScale(StoreInst);
700 int UnscaledStOffset =
701 TII->hasUnscaledLdStOffset(StoreInst)
702 ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
703 : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
704 int UnscaledLdOffset =
705 TII->hasUnscaledLdStOffset(LoadInst)
706 ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
707 : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
708 return (UnscaledStOffset <= UnscaledLdOffset) &&
709 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
710 }
711
isPromotableZeroStoreInst(MachineInstr & MI)712 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
713 unsigned Opc = MI.getOpcode();
714 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
715 isNarrowStore(Opc)) &&
716 getLdStRegOp(MI).getReg() == AArch64::WZR;
717 }
718
isPromotableLoadFromStore(MachineInstr & MI)719 static bool isPromotableLoadFromStore(MachineInstr &MI) {
720 switch (MI.getOpcode()) {
721 default:
722 return false;
723 // Scaled instructions.
724 case AArch64::LDRBBui:
725 case AArch64::LDRHHui:
726 case AArch64::LDRWui:
727 case AArch64::LDRXui:
728 // Unscaled instructions.
729 case AArch64::LDURBBi:
730 case AArch64::LDURHHi:
731 case AArch64::LDURWi:
732 case AArch64::LDURXi:
733 return true;
734 }
735 }
736
isMergeableLdStUpdate(MachineInstr & MI,AArch64FunctionInfo & AFI)737 static bool isMergeableLdStUpdate(MachineInstr &MI, AArch64FunctionInfo &AFI) {
738 unsigned Opc = MI.getOpcode();
739 switch (Opc) {
740 default:
741 return false;
742 // Scaled instructions.
743 case AArch64::STRSui:
744 case AArch64::STRDui:
745 case AArch64::STRQui:
746 case AArch64::STRXui:
747 case AArch64::STRWui:
748 case AArch64::STRHHui:
749 case AArch64::STRBBui:
750 case AArch64::LDRSui:
751 case AArch64::LDRDui:
752 case AArch64::LDRQui:
753 case AArch64::LDRXui:
754 case AArch64::LDRWui:
755 case AArch64::LDRHHui:
756 case AArch64::LDRBBui:
757 case AArch64::STGi:
758 case AArch64::STZGi:
759 case AArch64::ST2Gi:
760 case AArch64::STZ2Gi:
761 case AArch64::STGPi:
762 // Unscaled instructions.
763 case AArch64::STURSi:
764 case AArch64::STURDi:
765 case AArch64::STURQi:
766 case AArch64::STURWi:
767 case AArch64::STURXi:
768 case AArch64::LDURSi:
769 case AArch64::LDURDi:
770 case AArch64::LDURQi:
771 case AArch64::LDURWi:
772 case AArch64::LDURXi:
773 // Paired instructions.
774 case AArch64::LDPSi:
775 case AArch64::LDPSWi:
776 case AArch64::LDPDi:
777 case AArch64::LDPQi:
778 case AArch64::LDPWi:
779 case AArch64::LDPXi:
780 case AArch64::STPSi:
781 case AArch64::STPDi:
782 case AArch64::STPQi:
783 case AArch64::STPWi:
784 case AArch64::STPXi:
785 // Make sure this is a reg+imm (as opposed to an address reloc).
786 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
787 return false;
788
789 // When using stack tagging, simple sp+imm loads and stores are not
790 // tag-checked, but pre- and post-indexed versions of them are, so we can't
791 // replace the former with the latter. This transformation would be valid
792 // if the load/store accesses an untagged stack slot, but we don't have
793 // that information available after frame indices have been eliminated.
794 if (AFI.isMTETagged() &&
795 AArch64InstrInfo::getLdStBaseOp(MI).getReg() == AArch64::SP)
796 return false;
797
798 return true;
799 }
800 }
801
802 // Make sure this is a reg+reg Ld/St
isMergeableIndexLdSt(MachineInstr & MI,int & Scale)803 static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
804 unsigned Opc = MI.getOpcode();
805 switch (Opc) {
806 default:
807 return false;
808 // Scaled instructions.
809 // TODO: Add more index address stores.
810 case AArch64::LDRBroX:
811 case AArch64::LDRBBroX:
812 case AArch64::LDRSBXroX:
813 case AArch64::LDRSBWroX:
814 Scale = 1;
815 return true;
816 case AArch64::LDRHroX:
817 case AArch64::LDRHHroX:
818 case AArch64::LDRSHXroX:
819 case AArch64::LDRSHWroX:
820 Scale = 2;
821 return true;
822 case AArch64::LDRWroX:
823 case AArch64::LDRSroX:
824 case AArch64::LDRSWroX:
825 Scale = 4;
826 return true;
827 case AArch64::LDRDroX:
828 case AArch64::LDRXroX:
829 Scale = 8;
830 return true;
831 case AArch64::LDRQroX:
832 Scale = 16;
833 return true;
834 }
835 }
836
isRewritableImplicitDef(unsigned Opc)837 static bool isRewritableImplicitDef(unsigned Opc) {
838 switch (Opc) {
839 default:
840 return false;
841 case AArch64::ORRWrs:
842 case AArch64::ADDWri:
843 return true;
844 }
845 }
846
847 MachineBasicBlock::iterator
mergeNarrowZeroStores(MachineBasicBlock::iterator I,MachineBasicBlock::iterator MergeMI,const LdStPairFlags & Flags)848 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
849 MachineBasicBlock::iterator MergeMI,
850 const LdStPairFlags &Flags) {
851 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
852 "Expected promotable zero stores.");
853
854 MachineBasicBlock::iterator E = I->getParent()->end();
855 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
856 // If NextI is the second of the two instructions to be merged, we need
857 // to skip one further. Either way we merge will invalidate the iterator,
858 // and we don't need to scan the new instruction, as it's a pairwise
859 // instruction, which we're not considering for further action anyway.
860 if (NextI == MergeMI)
861 NextI = next_nodbg(NextI, E);
862
863 unsigned Opc = I->getOpcode();
864 unsigned MergeMIOpc = MergeMI->getOpcode();
865 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
866 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
867 int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
868 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;
869
870 bool MergeForward = Flags.getMergeForward();
871 // Insert our new paired instruction after whichever of the paired
872 // instructions MergeForward indicates.
873 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
874 // Also based on MergeForward is from where we copy the base register operand
875 // so we get the flags compatible with the input code.
876 const MachineOperand &BaseRegOp =
877 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
878 : AArch64InstrInfo::getLdStBaseOp(*I);
879
880 // Which register is Rt and which is Rt2 depends on the offset order.
881 int64_t IOffsetInBytes =
882 AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
883 int64_t MIOffsetInBytes =
884 AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
885 MergeMIOffsetStride;
886 // Select final offset based on the offset order.
887 int64_t OffsetImm;
888 if (IOffsetInBytes > MIOffsetInBytes)
889 OffsetImm = MIOffsetInBytes;
890 else
891 OffsetImm = IOffsetInBytes;
892
893 int NewOpcode = getMatchingWideOpcode(Opc);
894 // Adjust final offset on scaled stores because the new instruction
895 // has a different scale.
896 if (!TII->hasUnscaledLdStOffset(NewOpcode)) {
897 int NewOffsetStride = TII->getMemScale(NewOpcode);
898 assert(((OffsetImm % NewOffsetStride) == 0) &&
899 "Offset should be a multiple of the store memory scale");
900 OffsetImm = OffsetImm / NewOffsetStride;
901 }
902
903 // Construct the new instruction.
904 DebugLoc DL = I->getDebugLoc();
905 MachineBasicBlock *MBB = I->getParent();
906 MachineInstrBuilder MIB;
907 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(NewOpcode))
908 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
909 .add(BaseRegOp)
910 .addImm(OffsetImm)
911 .cloneMergedMemRefs({&*I, &*MergeMI})
912 .setMIFlags(I->mergeFlagsWith(*MergeMI));
913 (void)MIB;
914
915 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
916 LLVM_DEBUG(I->print(dbgs()));
917 LLVM_DEBUG(dbgs() << " ");
918 LLVM_DEBUG(MergeMI->print(dbgs()));
919 LLVM_DEBUG(dbgs() << " with instruction:\n ");
920 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
921 LLVM_DEBUG(dbgs() << "\n");
922
923 // Erase the old instructions.
924 I->eraseFromParent();
925 MergeMI->eraseFromParent();
926 return NextI;
927 }
928
929 // Apply Fn to all instructions between MI and the beginning of the block, until
930 // a def for DefReg is reached. Returns true, iff Fn returns true for all
931 // visited instructions. Stop after visiting Limit iterations.
forAllMIsUntilDef(MachineInstr & MI,MCPhysReg DefReg,const TargetRegisterInfo * TRI,unsigned Limit,std::function<bool (MachineInstr &,bool)> & Fn)932 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
933 const TargetRegisterInfo *TRI, unsigned Limit,
934 std::function<bool(MachineInstr &, bool)> &Fn) {
935 auto MBB = MI.getParent();
936 for (MachineInstr &I :
937 instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
938 if (!Limit)
939 return false;
940 --Limit;
941
942 bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
943 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
944 TRI->regsOverlap(MOP.getReg(), DefReg);
945 });
946 if (!Fn(I, isDef))
947 return false;
948 if (isDef)
949 break;
950 }
951 return true;
952 }
953
updateDefinedRegisters(MachineInstr & MI,LiveRegUnits & Units,const TargetRegisterInfo * TRI)954 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
955 const TargetRegisterInfo *TRI) {
956
957 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
958 if (MOP.isReg() && MOP.isKill())
959 Units.removeReg(MOP.getReg());
960
961 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
962 if (MOP.isReg() && !MOP.isKill())
963 Units.addReg(MOP.getReg());
964 }
965
966 /// This function will add a new entry into the debugValueSubstitutions table
967 /// when two instruction have been merged into a new one represented by \p
968 /// MergedInstr.
addDebugSubstitutionsToTable(MachineFunction * MF,unsigned InstrNumToSet,MachineInstr & OriginalInstr,MachineInstr & MergedInstr)969 static void addDebugSubstitutionsToTable(MachineFunction *MF,
970 unsigned InstrNumToSet,
971 MachineInstr &OriginalInstr,
972 MachineInstr &MergedInstr) {
973
974 // Figure out the Operand Index of the destination register of the
975 // OriginalInstr in the new MergedInstr.
976 auto Reg = OriginalInstr.getOperand(0).getReg();
977 unsigned OperandNo = 0;
978 bool RegFound = false;
979 for (const auto Op : MergedInstr.operands()) {
980 if (Op.getReg() == Reg) {
981 RegFound = true;
982 break;
983 }
984 OperandNo++;
985 }
986
987 if (RegFound)
988 MF->makeDebugValueSubstitution({OriginalInstr.peekDebugInstrNum(), 0},
989 {InstrNumToSet, OperandNo});
990 }
991
/// Merge the load/store \p I with \p Paired into a single paired instruction
/// (opcode chosen by getMatchingPairOpcode) and erase the two originals.
///
/// \p Flags supplies the merge direction (getMergeForward), the index of the
/// sign-extended result (getSExtIdx, -1 when there is none) and an optional
/// register that the transfer register of \p I is renamed to (getRenameReg).
///
/// Besides building the pair, this function fixes up kill flags that become
/// stale because one of the instructions effectively moves, emits the
/// KILL/SBFMXri sequence needed for a sign-extending load, rewrites SVE
/// fill/spill registers to their Q variants, and records debug value
/// substitutions for instructions carrying a debug-instr-number.
///
/// \returns the next non-debug instruction after \p I, skipping \p Paired when
/// it directly follows \p I.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(NextI, E);

  // When a sign-extended result is involved, work with the non-sign-extending
  // opcode; the extension is re-created explicitly further down.
  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  // Distance between two adjacent elements, in the units used by I's offset
  // operand: the memory scale for unscaled (byte) offsets, 1 otherwise.
  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();

  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (RenameReg) {
    MCRegister RegToRename = getLdStRegOp(*I).getReg();
    DefinedInBB.addReg(*RenameReg);

    // Return the sub/super register for RenameReg that is contained in the
    // register class C.
    auto GetMatchingSubReg =
        [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
      for (MCPhysReg SubOrSuper :
           TRI->sub_and_superregs_inclusive(*RenameReg)) {
        if (C->contains(SubOrSuper))
          return SubOrSuper;
      }
      llvm_unreachable("Should have found matching sub or super register!");
    };

    // Callback for forAllMIsUntilDef: rewrites operands overlapping
    // RegToRename to the matching sub/super register of RenameReg. IsDef tells
    // whether MI is the instruction defining RegToRename.
    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
                                                             bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(OpIdx);
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!MergeForward || !SeenDef ||
                   (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else {
                  // No register class constraint for this operand: only
                  // rewrite it if the opcode's implicit defs are known to be
                  // rewritable, otherwise leave the operand untouched.
                  if (!isRewritableImplicitDef(MI.getOpcode()))
                    continue;
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(MOP.getReg()));
                }
                MOP.setReg(MatchingReg);
                SeenDef = true;
              }
            }
          } else {
            // Not the defining instruction: rename every overlapping operand.
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(OpIdx);
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(MOP.getReg()));
                assert(MatchingReg != AArch64::NoRegister &&
                       "Cannot find matching regs for renaming");
                MOP.setReg(MatchingReg);
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI);
          return true;
        };
    // Start renaming at I when merging forward, otherwise at the instruction
    // just before Paired, and walk up to the def of RegToRename.
    forAllMIsUntilDef(MergeForward ? *I : *Paired->getPrevNode(), RegToRename,
                      TRI, UINT32_MAX, UpdateMIs);

#if !defined(NDEBUG)
    // For forward merging store:
    // Make sure the register used for renaming is not used between the
    // paired instructions. That would trash the content before the new
    // paired instruction.
    MCPhysReg RegToCheck = *RenameReg;
    // For backward merging load:
    // Make sure the register being renamed is not used between the
    // paired instructions. That would trash the content after the new
    // paired instruction.
    if (!MergeForward)
      RegToCheck = RegToRename;
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             MergeForward ? std::next(I) : I,
             MergeForward ? std::next(Paired) : Paired))
      assert(all_of(MI.operands(),
                    [this, RegToCheck](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             MOP.isUndef() ||
                             !TRI->regsOverlap(MOP.getReg(), RegToCheck);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // The new paired instruction is built at InsertionPoint: at Paired when
  // merging forward, at I otherwise.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
                   : AArch64InstrInfo::getLdStBaseOp(*I);

  int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre load/stores the Rt should be the one of the pre
  // load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(*I)) {
    // Paired accesses the lower address, so it provides Rt.
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  // Work on copies of the transfer register operands so their flags can be
  // adjusted before they are added to the new instruction.
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  // The copy that originates from Paired, whichever slot it ended up in.
  MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      //   STRWui kill %w0, ...
      //   USE %w1
      //   STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
      // We are about to move the store of w1, so its kill flag may become
      // invalid; not the case for w0.
      // Since w1 is used between the stores, the kill flag on w1 is cleared
      // after merging.
      //   STPWi kill %w0, %w1, ...
      //   USE %w1
      for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)
        if (It->readsRegister(PairedRegOp.getReg(), TRI))
          PairedRegOp.setIsKill(false);
    } else {
      // Clear kill flags of the first stores register. Example:
      //   STRWui %w1, ...
      //   USE kill %w1 ; need to clear kill flag when moving STRWui downwards
      //   STRW %w0
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(*RtMI))
    MIB.addReg(BaseRegOp.getReg(), RegState::Define);

  MIB.add(RegOp0)
      .add(RegOp1)
      .add(BaseRegOp)
      .addImm(OffsetImm)
      .cloneMergedMemRefs({&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(*Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << " ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << " with instruction:\n ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    //   %w1 = KILL %w1, implicit-def %x1
    //   %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;

    // In the case of a sign-extend, where we have something like:
    //   debugValueSubstitutions:[]
    //   $w1 = LDRWui $x0, 1, debug-instr-number 1
    //   DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    //   $x0 = LDRSWui $x0, 0, debug-instr-number 2
    //   DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // It will be converted to:
    //   debugValueSubstitutions:[]
    //   $w0, $w1 = LDPWi $x0, 0
    //   $w0 = KILL $w0, implicit-def $x0
    //   $x0 = SBFMXri $x0, 0, 31
    //   DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    //   DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // We want the final result to look like:
    //   debugValueSubstitutions:
    //   - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 1, subreg: 0 }
    //   - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    //   $w0, $w1 = LDPWi $x0, 0, debug-instr-number 4
    //   $w0 = KILL $w0, implicit-def $x0
    //   $x0 = SBFMXri $x0, 0, 31, debug-instr-number 3
    //   DBG_INSTR_REF !7, dbg-instr-ref(1, 0), debug-location !9
    //   DBG_INSTR_REF !8, dbg-instr-ref(2, 0), debug-location !9

    // $x0 is where the final value is stored, so the sign extend (SBFMXri)
    // instruction contains the final value we care about, and we give it a new
    // debug-instr-number 3. Whereas, $w1 contains the final value that we care
    // about, therefore the LDP instruction is also given a new
    // debug-instr-number 4. We have to add these substitutions to the
    // debugValueSubstitutions table. However, we also have to ensure that the
    // OpIndex that pointed to debug-instr-number 1 gets updated to 1, because
    // $w1 is the second operand of the LDP instruction.

    if (I->peekDebugInstrNum()) {
      // If I is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == I->getOperand(0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MBB->getParent(), NewInstrNum, *I,
                                     *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MBB->getParent(), NewInstrNum, *I, *MIB);
      }
    }
    if (Paired->peekDebugInstrNum()) {
      // If Paired is the instruction which got sign extended and has a
      // debug-instr-number, give the SBFMXri instruction a new
      // debug-instr-number, and update the debugValueSubstitutions table with
      // the new debug-instr-number and OpIndex pair. Otherwise, give the Merged
      // instruction a new debug-instr-number, and update the
      // debugValueSubstitutions table with the new debug-instr-number and
      // OpIndex pair.
      unsigned NewInstrNum;
      if (DstRegX == Paired->getOperand(0).getReg()) {
        NewInstrNum = MIBSXTW->getDebugInstrNum();
        addDebugSubstitutionsToTable(MBB->getParent(), NewInstrNum, *Paired,
                                     *MIBSXTW);
      } else {
        NewInstrNum = MIB->getDebugInstrNum();
        addDebugSubstitutionsToTable(MBB->getParent(), NewInstrNum, *Paired,
                                     *MIB);
      }
    }

    LLVM_DEBUG(dbgs() << " Extend operand:\n ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
    // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
    // variant of the registers.
    MachineOperand &MOp0 = MIB->getOperand(0);
    MachineOperand &MOp1 = MIB->getOperand(1);
    assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
           AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
    // Map Zn to Qn by its distance from Z0.
    MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
    MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  } else {

    // In the case that the merge doesn't result in a sign-extend, if we have
    // something like:
    //   debugValueSubstitutions:[]
    //   $x1 = LDRXui $x0, 1, debug-instr-number 1
    //   DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
    //   $x0 = LDRXui killed $x0, 0, debug-instr-number 2
    //   DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11

    // It will be converted to:
    //   debugValueSubstitutions: []
    //   $x0, $x1 = LDPXi $x0, 0
    //   DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
    //   DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11

    // We want the final result to look like:
    //   debugValueSubstitutions:
    //   - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 1, subreg: 0 }
    //   - { srcinst: 2, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 }
    //   $x0, $x1 = LDPXi $x0, 0, debug-instr-number 3
    //   DBG_INSTR_REF !13, dbg-instr-ref(1, 0), debug-location !11
    //   DBG_INSTR_REF !14, dbg-instr-ref(2, 0), debug-location !11

    // Here all that needs to be done is that the LDP instruction is given a
    // new debug-instr-number; we then need to add entries into the
    // debugSubstitutions table to map the old instr-refs to the new ones.

    // Assign a new DebugInstrNum to the merged pair instruction (MIB).
    if (I->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MBB->getParent(), NewDebugInstrNum, *I,
                                   *MIB);
    }
    if (Paired->peekDebugInstrNum()) {
      unsigned NewDebugInstrNum = MIB->getDebugInstrNum();
      addDebugSubstitutionsToTable(MBB->getParent(), NewDebugInstrNum, *Paired,
                                   *MIB);
    }

    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  // When merging forward, record the registers killed by I in DefinedInBB
  // before I is erased.
  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(*I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(MOP.getReg());

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
1394
1395 MachineBasicBlock::iterator
promoteLoadFromStore(MachineBasicBlock::iterator LoadI,MachineBasicBlock::iterator StoreI)1396 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1397 MachineBasicBlock::iterator StoreI) {
1398 MachineBasicBlock::iterator NextI =
1399 next_nodbg(LoadI, LoadI->getParent()->end());
1400
1401 int LoadSize = TII->getMemScale(*LoadI);
1402 int StoreSize = TII->getMemScale(*StoreI);
1403 Register LdRt = getLdStRegOp(*LoadI).getReg();
1404 const MachineOperand &StMO = getLdStRegOp(*StoreI);
1405 Register StRt = getLdStRegOp(*StoreI).getReg();
1406 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1407
1408 assert((IsStoreXReg ||
1409 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1410 "Unexpected RegClass");
1411
1412 MachineInstr *BitExtMI;
1413 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1414 // Remove the load, if the destination register of the loads is the same
1415 // register for stored value.
1416 if (StRt == LdRt && LoadSize == 8) {
1417 for (MachineInstr &MI : make_range(StoreI->getIterator(),
1418 LoadI->getIterator())) {
1419 if (MI.killsRegister(StRt, TRI)) {
1420 MI.clearRegisterKills(StRt, TRI);
1421 break;
1422 }
1423 }
1424 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1425 LLVM_DEBUG(LoadI->print(dbgs()));
1426 LLVM_DEBUG(dbgs() << "\n");
1427 LoadI->eraseFromParent();
1428 return NextI;
1429 }
1430 // Replace the load with a mov if the load and store are in the same size.
1431 BitExtMI =
1432 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1433 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1434 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1435 .add(StMO)
1436 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1437 .setMIFlags(LoadI->getFlags());
1438 } else {
1439 // FIXME: Currently we disable this transformation in big-endian targets as
1440 // performance and correctness are verified only in little-endian.
1441 if (!Subtarget->isLittleEndian())
1442 return NextI;
1443 bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1444 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1445 "Unsupported ld/st match");
1446 assert(LoadSize <= StoreSize && "Invalid load size");
1447 int UnscaledLdOffset =
1448 IsUnscaled
1449 ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1450 : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1451 int UnscaledStOffset =
1452 IsUnscaled
1453 ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1454 : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1455 int Width = LoadSize * 8;
1456 Register DestReg =
1457 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1458 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1459 : LdRt;
1460
1461 assert((UnscaledLdOffset >= UnscaledStOffset &&
1462 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1463 "Invalid offset");
1464
1465 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1466 int Imms = Immr + Width - 1;
1467 if (UnscaledLdOffset == UnscaledStOffset) {
1468 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1469 | ((Immr) << 6) // immr
1470 | ((Imms) << 0) // imms
1471 ;
1472
1473 BitExtMI =
1474 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1475 TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1476 DestReg)
1477 .add(StMO)
1478 .addImm(AndMaskEncoded)
1479 .setMIFlags(LoadI->getFlags());
1480 } else if (IsStoreXReg && Imms == 31) {
1481 // Use the 32 bit variant of UBFM if it's the LSR alias of the
1482 // instruction.
1483 assert(Immr <= Imms && "Expected LSR alias of UBFM");
1484 BitExtMI = BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1485 TII->get(AArch64::UBFMWri),
1486 TRI->getSubReg(DestReg, AArch64::sub_32))
1487 .addReg(TRI->getSubReg(StRt, AArch64::sub_32))
1488 .addImm(Immr)
1489 .addImm(Imms)
1490 .setMIFlags(LoadI->getFlags());
1491 } else {
1492 BitExtMI =
1493 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1494 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1495 DestReg)
1496 .add(StMO)
1497 .addImm(Immr)
1498 .addImm(Imms)
1499 .setMIFlags(LoadI->getFlags());
1500 }
1501 }
1502
1503 // Clear kill flags between store and load.
1504 for (MachineInstr &MI : make_range(StoreI->getIterator(),
1505 BitExtMI->getIterator()))
1506 if (MI.killsRegister(StRt, TRI)) {
1507 MI.clearRegisterKills(StRt, TRI);
1508 break;
1509 }
1510
1511 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1512 LLVM_DEBUG(StoreI->print(dbgs()));
1513 LLVM_DEBUG(dbgs() << " ");
1514 LLVM_DEBUG(LoadI->print(dbgs()));
1515 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1516 LLVM_DEBUG(StoreI->print(dbgs()));
1517 LLVM_DEBUG(dbgs() << " ");
1518 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1519 LLVM_DEBUG(dbgs() << "\n");
1520
1521 // Erase the old instructions.
1522 LoadI->eraseFromParent();
1523 return NextI;
1524 }
1525
/// Check whether \p Offset can be used as the immediate of a paired
/// load/store, i.e. whether its element offset lies in [-64, 63].
///
/// For unscaled instructions (\p IsUnscaled) \p Offset is a byte offset; it
/// must be a multiple of \p OffsetStride and is divided by it before the range
/// check. Otherwise \p Offset is checked as is.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  int ElementOffset = Offset;
  if (IsUnscaled) {
    // A byte offset that is not a multiple of the stride can never be
    // expressed as an element offset.
    if (Offset % OffsetStride != 0)
      return false;
    ElementOffset = Offset / OffsetStride;
  }
  return ElementOffset >= -64 && ElementOffset <= 63;
}
1538
// Round Num up to the next multiple of PowOf2, which must be a power of two.
// This is a signed-int specialization that avoids the C-style cast from
// uint64_t to int that using alignTo from
// include/llvm/Support/MathExtras.h would require.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  const int LowBits = PowOf2 - 1;
  return (Num + LowBits) & ~LowBits;
}
1546
mayAlias(MachineInstr & MIa,SmallVectorImpl<MachineInstr * > & MemInsns,AliasAnalysis * AA)1547 static bool mayAlias(MachineInstr &MIa,
1548 SmallVectorImpl<MachineInstr *> &MemInsns,
1549 AliasAnalysis *AA) {
1550 for (MachineInstr *MIb : MemInsns) {
1551 if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
1552 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1553 return true;
1554 }
1555 }
1556
1557 LLVM_DEBUG(dbgs() << "No aliases found\n");
1558 return false;
1559 }
1560
findMatchingStore(MachineBasicBlock::iterator I,unsigned Limit,MachineBasicBlock::iterator & StoreI)1561 bool AArch64LoadStoreOpt::findMatchingStore(
1562 MachineBasicBlock::iterator I, unsigned Limit,
1563 MachineBasicBlock::iterator &StoreI) {
1564 MachineBasicBlock::iterator B = I->getParent()->begin();
1565 MachineBasicBlock::iterator MBBI = I;
1566 MachineInstr &LoadMI = *I;
1567 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1568
1569 // If the load is the first instruction in the block, there's obviously
1570 // not any matching store.
1571 if (MBBI == B)
1572 return false;
1573
1574 // Track which register units have been modified and used between the first
1575 // insn and the second insn.
1576 ModifiedRegUnits.clear();
1577 UsedRegUnits.clear();
1578
1579 unsigned Count = 0;
1580 do {
1581 MBBI = prev_nodbg(MBBI, B);
1582 MachineInstr &MI = *MBBI;
1583
1584 // Don't count transient instructions towards the search limit since there
1585 // may be different numbers of them if e.g. debug information is present.
1586 if (!MI.isTransient())
1587 ++Count;
1588
1589 // If the load instruction reads directly from the address to which the
1590 // store instruction writes and the stored value is not modified, we can
1591 // promote the load. Since we do not handle stores with pre-/post-index,
1592 // it's unnecessary to check if BaseReg is modified by the store itself.
1593 // Also we can't handle stores without an immediate offset operand,
1594 // while the operand might be the address for a global variable.
1595 if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1596 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1597 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1598 isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1599 ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1600 StoreI = MBBI;
1601 return true;
1602 }
1603
1604 if (MI.isCall())
1605 return false;
1606
1607 // Update modified / uses register units.
1608 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1609
1610 // Otherwise, if the base register is modified, we have no match, so
1611 // return early.
1612 if (!ModifiedRegUnits.available(BaseReg))
1613 return false;
1614
1615 // If we encounter a store aliased with the load, return early.
1616 if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1617 return false;
1618 } while (MBBI != B && Count < Limit);
1619 return false;
1620 }
1621
needsWinCFI(const MachineFunction * MF)1622 static bool needsWinCFI(const MachineFunction *MF) {
1623 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1624 MF->getFunction().needsUnwindTableEntry();
1625 }
1626
1627 // Returns true if FirstMI and MI are candidates for merging or pairing.
1628 // Otherwise, returns false.
areCandidatesToMergeOrPair(MachineInstr & FirstMI,MachineInstr & MI,LdStPairFlags & Flags,const AArch64InstrInfo * TII)1629 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1630 LdStPairFlags &Flags,
1631 const AArch64InstrInfo *TII) {
1632 // If this is volatile or if pairing is suppressed, not a candidate.
1633 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1634 return false;
1635
1636 // We should have already checked FirstMI for pair suppression and volatility.
1637 assert(!FirstMI.hasOrderedMemoryRef() &&
1638 !TII->isLdStPairSuppressed(FirstMI) &&
1639 "FirstMI shouldn't get here if either of these checks are true.");
1640
1641 if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
1642 MI.getFlag(MachineInstr::FrameDestroy)))
1643 return false;
1644
1645 unsigned OpcA = FirstMI.getOpcode();
1646 unsigned OpcB = MI.getOpcode();
1647
1648 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1649 if (OpcA == OpcB)
1650 return !AArch64InstrInfo::isPreLdSt(FirstMI);
1651
1652 // Bail out if one of the opcodes is SVE fill/spill, as we currently don't
1653 // allow pairing them with other instructions.
1654 if (OpcA == AArch64::LDR_ZXI || OpcA == AArch64::STR_ZXI ||
1655 OpcB == AArch64::LDR_ZXI || OpcB == AArch64::STR_ZXI)
1656 return false;
1657
1658 // Two pre ld/st of different opcodes cannot be merged either
1659 if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1660 return false;
1661
1662 // Try to match a sign-extended load/store with a zero-extended load/store.
1663 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1664 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1665 assert(IsValidLdStrOpc &&
1666 "Given Opc should be a Load or Store with an immediate");
1667 // OpcA will be the first instruction in the pair.
1668 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1669 Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0);
1670 return true;
1671 }
1672
1673 // If the second instruction isn't even a mergable/pairable load/store, bail
1674 // out.
1675 if (!PairIsValidLdStrOpc)
1676 return false;
1677
1678 // Narrow stores do not have a matching pair opcodes, so constrain their
1679 // merging to zero stores.
1680 if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1681 return getLdStRegOp(FirstMI).getReg() == AArch64::WZR &&
1682 getLdStRegOp(MI).getReg() == AArch64::WZR &&
1683 TII->getMemScale(FirstMI) == TII->getMemScale(MI);
1684
1685 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1686 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1687 // are candidate pairs that can be merged.
1688 if (isPreLdStPairCandidate(FirstMI, MI))
1689 return true;
1690
1691 // Try to match an unscaled load/store with a scaled load/store.
1692 return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1693 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1694
1695 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1696 }
1697
canRenameMOP(const MachineOperand & MOP,const TargetRegisterInfo * TRI)1698 static bool canRenameMOP(const MachineOperand &MOP,
1699 const TargetRegisterInfo *TRI) {
1700 if (MOP.isReg()) {
1701 auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1702 // Renaming registers with multiple disjunct sub-registers (e.g. the
1703 // result of a LD3) means that all sub-registers are renamed, potentially
1704 // impacting other instructions we did not check. Bail out.
1705 // Note that this relies on the structure of the AArch64 register file. In
1706 // particular, a subregister cannot be written without overwriting the
1707 // whole register.
1708 if (RegClass->HasDisjunctSubRegs && RegClass->CoveredBySubRegs &&
1709 (TRI->getSubRegisterClass(RegClass, AArch64::dsub0) ||
1710 TRI->getSubRegisterClass(RegClass, AArch64::qsub0) ||
1711 TRI->getSubRegisterClass(RegClass, AArch64::zsub0))) {
1712 LLVM_DEBUG(
1713 dbgs()
1714 << " Cannot rename operands with multiple disjunct subregisters ("
1715 << MOP << ")\n");
1716 return false;
1717 }
1718
1719 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1720 // them must be known. For example, in ORRWrs the implicit-def
1721 // corresponds to the result register.
1722 if (MOP.isImplicit() && MOP.isDef()) {
1723 if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
1724 return false;
1725 return TRI->isSuperOrSubRegisterEq(
1726 MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
1727 }
1728 }
1729 return MOP.isImplicit() ||
1730 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1731 }
1732
1733 static bool
canRenameUpToDef(MachineInstr & FirstMI,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1734 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1735 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1736 const TargetRegisterInfo *TRI) {
1737 if (!FirstMI.mayStore())
1738 return false;
1739
1740 // Check if we can find an unused register which we can use to rename
1741 // the register used by the first load/store.
1742
1743 auto RegToRename = getLdStRegOp(FirstMI).getReg();
1744 // For now, we only rename if the store operand gets killed at the store.
1745 if (!getLdStRegOp(FirstMI).isKill() &&
1746 !any_of(FirstMI.operands(),
1747 [TRI, RegToRename](const MachineOperand &MOP) {
1748 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1749 MOP.isImplicit() && MOP.isKill() &&
1750 TRI->regsOverlap(RegToRename, MOP.getReg());
1751 })) {
1752 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1753 return false;
1754 }
1755
1756 bool FoundDef = false;
1757
1758 // For each instruction between FirstMI and the previous def for RegToRename,
1759 // we
1760 // * check if we can rename RegToRename in this instruction
1761 // * collect the registers used and required register classes for RegToRename.
1762 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1763 bool IsDef) {
1764 LLVM_DEBUG(dbgs() << "Checking " << MI);
1765 // Currently we do not try to rename across frame-setup instructions.
1766 if (MI.getFlag(MachineInstr::FrameSetup)) {
1767 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1768 << "currently\n");
1769 return false;
1770 }
1771
1772 UsedInBetween.accumulate(MI);
1773
1774 // For a definition, check that we can rename the definition and exit the
1775 // loop.
1776 FoundDef = IsDef;
1777
1778 // For defs, check if we can rename the first def of RegToRename.
1779 if (FoundDef) {
1780 // For some pseudo instructions, we might not generate code in the end
1781 // (e.g. KILL) and we would end up without a correct def for the rename
1782 // register.
1783 // TODO: This might be overly conservative and we could handle those cases
1784 // in multiple ways:
1785 // 1. Insert an extra copy, to materialize the def.
1786 // 2. Skip pseudo-defs until we find an non-pseudo def.
1787 if (MI.isPseudo()) {
1788 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1789 return false;
1790 }
1791
1792 for (auto &MOP : MI.operands()) {
1793 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1794 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1795 continue;
1796 if (!canRenameMOP(MOP, TRI)) {
1797 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1798 return false;
1799 }
1800 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1801 }
1802 return true;
1803 } else {
1804 for (auto &MOP : MI.operands()) {
1805 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1806 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1807 continue;
1808
1809 if (!canRenameMOP(MOP, TRI)) {
1810 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1811 return false;
1812 }
1813 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1814 }
1815 }
1816 return true;
1817 };
1818
1819 if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1820 return false;
1821
1822 if (!FoundDef) {
1823 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1824 return false;
1825 }
1826 return true;
1827 }
1828
1829 // We want to merge the second load into the first by rewriting the usages of
1830 // the same reg between first (incl.) and second (excl.). We don't need to care
1831 // about any insns before FirstLoad or after SecondLoad.
1832 // 1. The second load writes new value into the same reg.
1833 // - The renaming is impossible to impact later use of the reg.
1834 // - The second load always trash the value written by the first load which
1835 // means the reg must be killed before the second load.
1836 // 2. The first load must be a def for the same reg so we don't need to look
1837 // into anything before it.
canRenameUntilSecondLoad(MachineInstr & FirstLoad,MachineInstr & SecondLoad,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1838 static bool canRenameUntilSecondLoad(
1839 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1840 LiveRegUnits &UsedInBetween,
1841 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1842 const TargetRegisterInfo *TRI) {
1843 if (FirstLoad.isPseudo())
1844 return false;
1845
1846 UsedInBetween.accumulate(FirstLoad);
1847 auto RegToRename = getLdStRegOp(FirstLoad).getReg();
1848 bool Success = std::all_of(
1849 FirstLoad.getIterator(), SecondLoad.getIterator(),
1850 [&](MachineInstr &MI) {
1851 LLVM_DEBUG(dbgs() << "Checking " << MI);
1852 // Currently we do not try to rename across frame-setup instructions.
1853 if (MI.getFlag(MachineInstr::FrameSetup)) {
1854 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1855 << "currently\n");
1856 return false;
1857 }
1858
1859 for (auto &MOP : MI.operands()) {
1860 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1861 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1862 continue;
1863 if (!canRenameMOP(MOP, TRI)) {
1864 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1865 return false;
1866 }
1867 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1868 }
1869
1870 return true;
1871 });
1872 return Success;
1873 }
1874
1875 // Check if we can find a physical register for renaming \p Reg. This register
1876 // must:
1877 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1878 // defined registers up to the point where the renamed register will be used,
1879 // * not used in \p UsedInBetween; UsedInBetween must contain all accessed
1880 // registers in the range the rename register will be used,
1881 // * is available in all used register classes (checked using RequiredClasses).
tryToFindRegisterToRename(const MachineFunction & MF,Register Reg,LiveRegUnits & DefinedInBB,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1882 static std::optional<MCPhysReg> tryToFindRegisterToRename(
1883 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1884 LiveRegUnits &UsedInBetween,
1885 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1886 const TargetRegisterInfo *TRI) {
1887 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1888
1889 // Checks if any sub- or super-register of PR is callee saved.
1890 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1891 return any_of(TRI->sub_and_superregs_inclusive(PR),
1892 [&MF, TRI](MCPhysReg SubOrSuper) {
1893 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1894 });
1895 };
1896
1897 // Check if PR or one of its sub- or super-registers can be used for all
1898 // required register classes.
1899 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1900 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1901 return any_of(
1902 TRI->sub_and_superregs_inclusive(PR),
1903 [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
1904 });
1905 };
1906
1907 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1908 for (const MCPhysReg &PR : *RegClass) {
1909 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1910 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1911 CanBeUsedForAllClasses(PR)) {
1912 DefinedInBB.addReg(PR);
1913 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1914 << "\n");
1915 return {PR};
1916 }
1917 }
1918 LLVM_DEBUG(dbgs() << "No rename register found from "
1919 << TRI->getRegClassName(RegClass) << "\n");
1920 return std::nullopt;
1921 }
1922
1923 // For store pairs: returns a register from FirstMI to the beginning of the
1924 // block that can be renamed.
1925 // For load pairs: returns a register from FirstMI to MI that can be renamed.
findRenameRegForSameLdStRegPair(std::optional<bool> MaybeCanRename,MachineInstr & FirstMI,MachineInstr & MI,Register Reg,LiveRegUnits & DefinedInBB,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1926 static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1927 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1928 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1929 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1930 const TargetRegisterInfo *TRI) {
1931 std::optional<MCPhysReg> RenameReg;
1932 if (!DebugCounter::shouldExecute(RegRenamingCounter))
1933 return RenameReg;
1934
1935 auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1936 MachineFunction &MF = *FirstMI.getParent()->getParent();
1937 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1938 return RenameReg;
1939
1940 const bool IsLoad = FirstMI.mayLoad();
1941
1942 if (!MaybeCanRename) {
1943 if (IsLoad)
1944 MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
1945 RequiredClasses, TRI)};
1946 else
1947 MaybeCanRename = {
1948 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1949 }
1950
1951 if (*MaybeCanRename) {
1952 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1953 RequiredClasses, TRI);
1954 }
1955 return RenameReg;
1956 }
1957
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
///
/// The scan starts at the first non-debug instruction after \p I and runs
/// until the end of the block or until \p Limit non-transient instructions
/// have been examined.
///
/// \param I               The first load/store of the prospective pair.
/// \param Flags           Receives the sign-extension index, the merge
///                        direction and, if renaming is needed, the rename
///                        register for the returned match.
/// \param Limit           Maximum number of non-transient instructions to
///                        examine.
/// \param FindNarrowMerge When true, apply the narrow-merge constraints
///                        instead of the pair offset range checks.
/// \returns An iterator to the matching instruction, or the block's end
///          iterator if no match was found.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  // Candidate that can only be merged forward with the help of a rename
  // register; set together with Flags.setRenameReg() below.
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  // Distance between two adjacent accesses, expressed in the units of
  // FirstMI's offset operand.
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  // Left unset when renaming is enabled so that the rename helper runs the
  // renameability check itself; forced to false when renaming is disabled.
  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register it can't be paired with a pre-index ld/st
        // pair. Additionally if the base reg is used or modified the operations
        // can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction is
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // The two accesses must be exactly one stride apart, in either
          // order; otherwise bail and keep looking.
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field.  If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1 [x2]
        //   ldr x2 [x3]
        //   ldr x4 [x2, #8],
        // the first and third ldr cannot be converted to ldp x1, x4, [x2]
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        // True when both instructions are loads into the same register (or a
        // sub-/super-register of it); such a pair needs a rename register.
        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                Reg, getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(&MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          // Drop any rename register recorded for an earlier forward-merge
          // candidate; this backward merge does not need one.
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          // Rt of FirstMI is modified in between: a forward merge is only
          // possible with a rename register. Record the candidate; it is
          // returned by the getRenameReg() check after this block.
          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    // A forward-merge candidate with a rename register has been recorded:
    // return it.
    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // If the instruction wasn't a matching load or store.  Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}
2230
2231 static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr & MI,MachineBasicBlock::iterator MaybeCFI)2232 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
2233 assert((MI.getOpcode() == AArch64::SUBXri ||
2234 MI.getOpcode() == AArch64::ADDXri) &&
2235 "Expected a register update instruction");
2236 auto End = MI.getParent()->end();
2237 if (MaybeCFI == End ||
2238 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
2239 !(MI.getFlag(MachineInstr::FrameSetup) ||
2240 MI.getFlag(MachineInstr::FrameDestroy)) ||
2241 MI.getOperand(0).getReg() != AArch64::SP)
2242 return End;
2243
2244 const MachineFunction &MF = *MI.getParent()->getParent();
2245 unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
2246 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
2247 switch (CFI.getOperation()) {
2248 case MCCFIInstruction::OpDefCfa:
2249 case MCCFIInstruction::OpDefCfaOffset:
2250 return MaybeCFI;
2251 default:
2252 return End;
2253 }
2254 }
2255
mergeUpdateInsn(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Update,bool IsForward,bool IsPreIdx,bool MergeEither)2256 std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
2257 MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
2258 bool IsForward, bool IsPreIdx, bool MergeEither) {
2259 assert((Update->getOpcode() == AArch64::ADDXri ||
2260 Update->getOpcode() == AArch64::SUBXri) &&
2261 "Unexpected base register update instruction to merge!");
2262 MachineBasicBlock::iterator E = I->getParent()->end();
2263 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
2264
2265 // If updating the SP and the following instruction is CFA offset related CFI,
2266 // make sure the CFI follows the SP update either by merging at the location
2267 // of the update or by moving the CFI after the merged instruction. If unable
2268 // to do so, bail.
2269 MachineBasicBlock::iterator InsertPt = I;
2270 if (IsForward) {
2271 assert(IsPreIdx);
2272 if (auto CFI = maybeMoveCFI(*Update, next_nodbg(Update, E)); CFI != E) {
2273 if (MergeEither) {
2274 InsertPt = Update;
2275 } else {
2276 // Take care not to reorder CFIs.
2277 if (std::any_of(std::next(CFI), I, [](const auto &Insn) {
2278 return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
2279 }))
2280 return std::nullopt;
2281
2282 MachineBasicBlock *MBB = InsertPt->getParent();
2283 MBB->splice(std::next(InsertPt), MBB, CFI);
2284 }
2285 }
2286 }
2287
2288 // Return the instruction following the merged instruction, which is
2289 // the instruction following our unmerged load. Unless that's the add/sub
2290 // instruction we're merging, in which case it's the one after that.
2291 if (NextI == Update)
2292 NextI = next_nodbg(NextI, E);
2293
2294 int Value = Update->getOperand(2).getImm();
2295 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2296 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2297 if (Update->getOpcode() == AArch64::SUBXri)
2298 Value = -Value;
2299
2300 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
2301 : getPostIndexedOpcode(I->getOpcode());
2302 MachineInstrBuilder MIB;
2303 int Scale, MinOffset, MaxOffset;
2304 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
2305 if (!AArch64InstrInfo::isPairedLdSt(*I)) {
2306 // Non-paired instruction.
2307 MIB = BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
2308 TII->get(NewOpc))
2309 .add(Update->getOperand(0))
2310 .add(getLdStRegOp(*I))
2311 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2312 .addImm(Value / Scale)
2313 .setMemRefs(I->memoperands())
2314 .setMIFlags(I->mergeFlagsWith(*Update));
2315 } else {
2316 // Paired instruction.
2317 MIB = BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
2318 TII->get(NewOpc))
2319 .add(Update->getOperand(0))
2320 .add(getLdStRegOp(*I, 0))
2321 .add(getLdStRegOp(*I, 1))
2322 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2323 .addImm(Value / Scale)
2324 .setMemRefs(I->memoperands())
2325 .setMIFlags(I->mergeFlagsWith(*Update));
2326 }
2327
2328 if (IsPreIdx) {
2329 ++NumPreFolded;
2330 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2331 } else {
2332 ++NumPostFolded;
2333 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2334 }
2335 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2336 LLVM_DEBUG(I->print(dbgs()));
2337 LLVM_DEBUG(dbgs() << " ");
2338 LLVM_DEBUG(Update->print(dbgs()));
2339 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2340 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2341 LLVM_DEBUG(dbgs() << "\n");
2342
2343 // Erase the old instructions for the block.
2344 I->eraseFromParent();
2345 Update->eraseFromParent();
2346
2347 return NextI;
2348 }
2349
2350 MachineBasicBlock::iterator
mergeConstOffsetInsn(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Update,unsigned Offset,int Scale)2351 AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
2352 MachineBasicBlock::iterator Update,
2353 unsigned Offset, int Scale) {
2354 assert((Update->getOpcode() == AArch64::MOVKWi) &&
2355 "Unexpected const mov instruction to merge!");
2356 MachineBasicBlock::iterator E = I->getParent()->end();
2357 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
2358 MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
2359 MachineInstr &MemMI = *I;
2360 unsigned Mask = (1 << 12) * Scale - 1;
2361 unsigned Low = Offset & Mask;
2362 unsigned High = Offset - Low;
2363 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2364 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
2365 MachineInstrBuilder AddMIB, MemMIB;
2366
2367 // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2368 AddMIB =
2369 BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
2370 .addDef(IndexReg)
2371 .addUse(BaseReg)
2372 .addImm(High >> 12) // shifted value
2373 .addImm(12); // shift 12
2374 (void)AddMIB;
2375 // Ld/St DestReg, IndexReg, Imm12
2376 unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
2377 MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2378 .add(getLdStRegOp(MemMI))
2379 .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
2380 .addImm(Low / Scale)
2381 .setMemRefs(I->memoperands())
2382 .setMIFlags(I->mergeFlagsWith(*Update));
2383 (void)MemMIB;
2384
2385 ++NumConstOffsetFolded;
2386 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
2387 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2388 LLVM_DEBUG(PrevI->print(dbgs()));
2389 LLVM_DEBUG(dbgs() << " ");
2390 LLVM_DEBUG(Update->print(dbgs()));
2391 LLVM_DEBUG(dbgs() << " ");
2392 LLVM_DEBUG(I->print(dbgs()));
2393 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2394 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
2395 LLVM_DEBUG(dbgs() << " ");
2396 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
2397 LLVM_DEBUG(dbgs() << "\n");
2398
2399 // Erase the old instructions for the block.
2400 I->eraseFromParent();
2401 PrevI->eraseFromParent();
2402 Update->eraseFromParent();
2403
2404 return NextI;
2405 }
2406
isMatchingUpdateInsn(MachineInstr & MemMI,MachineInstr & MI,unsigned BaseReg,int Offset)2407 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2408 MachineInstr &MI,
2409 unsigned BaseReg, int Offset) {
2410 switch (MI.getOpcode()) {
2411 default:
2412 break;
2413 case AArch64::SUBXri:
2414 case AArch64::ADDXri:
2415 // Make sure it's a vanilla immediate operand, not a relocation or
2416 // anything else we can't handle.
2417 if (!MI.getOperand(2).isImm())
2418 break;
2419 // Watch out for 1 << 12 shifted value.
2420 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
2421 break;
2422
2423 // The update instruction source and destination register must be the
2424 // same as the load/store base register.
2425 if (MI.getOperand(0).getReg() != BaseReg ||
2426 MI.getOperand(1).getReg() != BaseReg)
2427 break;
2428
2429 int UpdateOffset = MI.getOperand(2).getImm();
2430 if (MI.getOpcode() == AArch64::SUBXri)
2431 UpdateOffset = -UpdateOffset;
2432
2433 // The immediate must be a multiple of the scaling factor of the pre/post
2434 // indexed instruction.
2435 int Scale, MinOffset, MaxOffset;
2436 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
2437 if (UpdateOffset % Scale != 0)
2438 break;
2439
2440 // Scaled offset must fit in the instruction immediate.
2441 int ScaledOffset = UpdateOffset / Scale;
2442 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2443 break;
2444
2445 // If we have a non-zero Offset, we check that it matches the amount
2446 // we're adding to the register.
2447 if (!Offset || Offset == UpdateOffset)
2448 return true;
2449 break;
2450 }
2451 return false;
2452 }
2453
isMatchingMovConstInsn(MachineInstr & MemMI,MachineInstr & MI,unsigned IndexReg,unsigned & Offset)2454 bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
2455 MachineInstr &MI,
2456 unsigned IndexReg,
2457 unsigned &Offset) {
2458 // The update instruction source and destination register must be the
2459 // same as the load/store index register.
2460 if (MI.getOpcode() == AArch64::MOVKWi &&
2461 TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
2462
2463 // movz + movk hold a large offset of a Ld/St instruction.
2464 MachineBasicBlock::iterator B = MI.getParent()->begin();
2465 MachineBasicBlock::iterator MBBI = &MI;
2466 // Skip the scene when the MI is the first instruction of a block.
2467 if (MBBI == B)
2468 return false;
2469 MBBI = prev_nodbg(MBBI, B);
2470 MachineInstr &MovzMI = *MBBI;
2471 // Make sure the MOVKWi and MOVZWi set the same register.
2472 if (MovzMI.getOpcode() == AArch64::MOVZWi &&
2473 MovzMI.getOperand(0).getReg() == MI.getOperand(0).getReg()) {
2474 unsigned Low = MovzMI.getOperand(1).getImm();
2475 unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
2476 Offset = High + Low;
2477 // 12-bit optionally shifted immediates are legal for adds.
2478 return Offset >> 24 == 0;
2479 }
2480 }
2481 return false;
2482 }
2483
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,int UnscaledOffset,unsigned Limit)2484 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2485 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2486 MachineBasicBlock::iterator E = I->getParent()->end();
2487 MachineInstr &MemMI = *I;
2488 MachineBasicBlock::iterator MBBI = I;
2489
2490 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2491 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
2492 TII->getMemScale(MemMI);
2493
2494 // Scan forward looking for post-index opportunities. Updating instructions
2495 // can't be formed if the memory instruction doesn't have the offset we're
2496 // looking for.
2497 if (MIUnscaledOffset != UnscaledOffset)
2498 return E;
2499
2500 // If the base register overlaps a source/destination register, we can't
2501 // merge the update. This does not apply to tag store instructions which
2502 // ignore the address part of the source register.
2503 // This does not apply to STGPi as well, which does not have unpredictable
2504 // behavior in this case unlike normal stores, and always performs writeback
2505 // after reading the source register value.
2506 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2507 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2508 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2509 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2510 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2511 return E;
2512 }
2513 }
2514
2515 // Track which register units have been modified and used between the first
2516 // insn (inclusive) and the second insn.
2517 ModifiedRegUnits.clear();
2518 UsedRegUnits.clear();
2519 MBBI = next_nodbg(MBBI, E);
2520
2521 // We can't post-increment the stack pointer if any instruction between
2522 // the memory access (I) and the increment (MBBI) can access the memory
2523 // region defined by [SP, MBBI].
2524 const bool BaseRegSP = BaseReg == AArch64::SP;
2525 if (BaseRegSP && needsWinCFI(I->getMF())) {
2526 // FIXME: For now, we always block the optimization over SP in windows
2527 // targets as it requires to adjust the unwind/debug info, messing up
2528 // the unwind info can actually cause a miscompile.
2529 return E;
2530 }
2531
2532 unsigned Count = 0;
2533 MachineBasicBlock *CurMBB = I->getParent();
2534 // choice of next block to visit is liveins-based
2535 bool VisitSucc = CurMBB->getParent()->getRegInfo().tracksLiveness();
2536
2537 while (true) {
2538 for (MachineBasicBlock::iterator CurEnd = CurMBB->end();
2539 MBBI != CurEnd && Count < Limit; MBBI = next_nodbg(MBBI, CurEnd)) {
2540 MachineInstr &MI = *MBBI;
2541
2542 // Don't count transient instructions towards the search limit since there
2543 // may be different numbers of them if e.g. debug information is present.
2544 if (!MI.isTransient())
2545 ++Count;
2546
2547 // If we found a match, return it.
2548 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
2549 return MBBI;
2550
2551 // Update the status of what the instruction clobbered and used.
2552 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits,
2553 TRI);
2554
2555 // Otherwise, if the base register is used or modified, we have no match,
2556 // so return early. If we are optimizing SP, do not allow instructions
2557 // that may load or store in between the load and the optimized value
2558 // update.
2559 if (!ModifiedRegUnits.available(BaseReg) ||
2560 !UsedRegUnits.available(BaseReg) ||
2561 (BaseRegSP && MBBI->mayLoadOrStore()))
2562 return E;
2563 }
2564
2565 if (!VisitSucc || Limit <= Count)
2566 break;
2567
2568 // Try to go downward to successors along a CF path w/o side enters
2569 // such that BaseReg is alive along it but not at its exits
2570 MachineBasicBlock *SuccToVisit = nullptr;
2571 unsigned LiveSuccCount = 0;
2572 for (MachineBasicBlock *Succ : CurMBB->successors()) {
2573 for (MCRegAliasIterator AI(BaseReg, TRI, true); AI.isValid(); ++AI) {
2574 if (Succ->isLiveIn(*AI)) {
2575 if (LiveSuccCount++)
2576 return E;
2577 if (Succ->pred_size() == 1)
2578 SuccToVisit = Succ;
2579 break;
2580 }
2581 }
2582 }
2583 if (!SuccToVisit)
2584 break;
2585 CurMBB = SuccToVisit;
2586 MBBI = CurMBB->begin();
2587 }
2588
2589 return E;
2590 }
2591
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,unsigned Limit,bool & MergeEither)2592 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2593 MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
2594 MachineBasicBlock::iterator B = I->getParent()->begin();
2595 MachineBasicBlock::iterator E = I->getParent()->end();
2596 MachineInstr &MemMI = *I;
2597 MachineBasicBlock::iterator MBBI = I;
2598 MachineFunction &MF = *MemMI.getMF();
2599
2600 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2601 int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
2602
2603 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2604 Register DestReg[] = {getLdStRegOp(MemMI, 0).getReg(),
2605 IsPairedInsn ? getLdStRegOp(MemMI, 1).getReg()
2606 : AArch64::NoRegister};
2607
2608 // If the load/store is the first instruction in the block, there's obviously
2609 // not any matching update. Ditto if the memory offset isn't zero.
2610 if (MBBI == B || Offset != 0)
2611 return E;
2612 // If the base register overlaps a destination register, we can't
2613 // merge the update.
2614 if (!isTagStore(MemMI)) {
2615 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
2616 if (DestReg[i] == BaseReg || TRI->isSubRegister(BaseReg, DestReg[i]))
2617 return E;
2618 }
2619
2620 const bool BaseRegSP = BaseReg == AArch64::SP;
2621 if (BaseRegSP && needsWinCFI(I->getMF())) {
2622 // FIXME: For now, we always block the optimization over SP in windows
2623 // targets as it requires to adjust the unwind/debug info, messing up
2624 // the unwind info can actually cause a miscompile.
2625 return E;
2626 }
2627
2628 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2629 unsigned RedZoneSize =
2630 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2631
2632 // Track which register units have been modified and used between the first
2633 // insn (inclusive) and the second insn.
2634 ModifiedRegUnits.clear();
2635 UsedRegUnits.clear();
2636 unsigned Count = 0;
2637 bool MemAccessBeforeSPPreInc = false;
2638 MergeEither = true;
2639 do {
2640 MBBI = prev_nodbg(MBBI, B);
2641 MachineInstr &MI = *MBBI;
2642
2643 // Don't count transient instructions towards the search limit since there
2644 // may be different numbers of them if e.g. debug information is present.
2645 if (!MI.isTransient())
2646 ++Count;
2647
2648 // If we found a match, return it.
2649 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2650 // Check that the update value is within our red zone limit (which may be
2651 // zero).
2652 if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2653 return E;
2654 return MBBI;
2655 }
2656
2657 // Update the status of what the instruction clobbered and used.
2658 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2659
2660 // Otherwise, if the base register is used or modified, we have no match, so
2661 // return early.
2662 if (!ModifiedRegUnits.available(BaseReg) ||
2663 !UsedRegUnits.available(BaseReg))
2664 return E;
2665
2666 // If we have a destination register (i.e. a load instruction) and a
2667 // destination register is used or modified, then we can only merge forward,
2668 // i.e. the combined instruction is put in the place of the memory
2669 // instruction. Same applies if we see a memory access or side effects.
2670 if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
2671 (DestReg[0] != AArch64::NoRegister &&
2672 !(ModifiedRegUnits.available(DestReg[0]) &&
2673 UsedRegUnits.available(DestReg[0]))) ||
2674 (DestReg[1] != AArch64::NoRegister &&
2675 !(ModifiedRegUnits.available(DestReg[1]) &&
2676 UsedRegUnits.available(DestReg[1]))))
2677 MergeEither = false;
2678
2679 // Keep track if we have a memory access before an SP pre-increment, in this
2680 // case we need to validate later that the update amount respects the red
2681 // zone.
2682 if (BaseRegSP && MBBI->mayLoadOrStore())
2683 MemAccessBeforeSPPreInc = true;
2684 } while (MBBI != B && Count < Limit);
2685 return E;
2686 }
2687
2688 MachineBasicBlock::iterator
findMatchingConstOffsetBackward(MachineBasicBlock::iterator I,unsigned Limit,unsigned & Offset)2689 AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2690 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2691 MachineBasicBlock::iterator B = I->getParent()->begin();
2692 MachineBasicBlock::iterator E = I->getParent()->end();
2693 MachineInstr &MemMI = *I;
2694 MachineBasicBlock::iterator MBBI = I;
2695
2696 // If the load is the first instruction in the block, there's obviously
2697 // not any matching load or store.
2698 if (MBBI == B)
2699 return E;
2700
2701 // Make sure the IndexReg is killed and the shift amount is zero.
2702 // TODO: Relex this restriction to extend, simplify processing now.
2703 if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
2704 !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
2705 (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
2706 return E;
2707
2708 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
2709
2710 // Track which register units have been modified and used between the first
2711 // insn (inclusive) and the second insn.
2712 ModifiedRegUnits.clear();
2713 UsedRegUnits.clear();
2714 unsigned Count = 0;
2715 do {
2716 MBBI = prev_nodbg(MBBI, B);
2717 MachineInstr &MI = *MBBI;
2718
2719 // Don't count transient instructions towards the search limit since there
2720 // may be different numbers of them if e.g. debug information is present.
2721 if (!MI.isTransient())
2722 ++Count;
2723
2724 // If we found a match, return it.
2725 if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
2726 return MBBI;
2727 }
2728
2729 // Update the status of what the instruction clobbered and used.
2730 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2731
2732 // Otherwise, if the index register is used or modified, we have no match,
2733 // so return early.
2734 if (!ModifiedRegUnits.available(IndexReg) ||
2735 !UsedRegUnits.available(IndexReg))
2736 return E;
2737
2738 } while (MBBI != B && Count < Limit);
2739 return E;
2740 }
2741
tryToPromoteLoadFromStore(MachineBasicBlock::iterator & MBBI)2742 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2743 MachineBasicBlock::iterator &MBBI) {
2744 MachineInstr &MI = *MBBI;
2745 // If this is a volatile load, don't mess with it.
2746 if (MI.hasOrderedMemoryRef())
2747 return false;
2748
2749 if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
2750 return false;
2751
2752 // Make sure this is a reg+imm.
2753 // FIXME: It is possible to extend it to handle reg+reg cases.
2754 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2755 return false;
2756
2757 // Look backward up to LdStLimit instructions.
2758 MachineBasicBlock::iterator StoreI;
2759 if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2760 ++NumLoadsFromStoresPromoted;
2761 // Promote the load. Keeping the iterator straight is a
2762 // pain, so we let the merge routine tell us what the next instruction
2763 // is after it's done mucking about.
2764 MBBI = promoteLoadFromStore(MBBI, StoreI);
2765 return true;
2766 }
2767 return false;
2768 }
2769
2770 // Merge adjacent zero stores into a wider store.
tryToMergeZeroStInst(MachineBasicBlock::iterator & MBBI)2771 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2772 MachineBasicBlock::iterator &MBBI) {
2773 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2774 MachineInstr &MI = *MBBI;
2775 MachineBasicBlock::iterator E = MI.getParent()->end();
2776
2777 if (!TII->isCandidateToMergeOrPair(MI))
2778 return false;
2779
2780 // Look ahead up to LdStLimit instructions for a mergeable instruction.
2781 LdStPairFlags Flags;
2782 MachineBasicBlock::iterator MergeMI =
2783 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2784 if (MergeMI != E) {
2785 ++NumZeroStoresPromoted;
2786
2787 // Keeping the iterator straight is a pain, so we let the merge routine tell
2788 // us what the next instruction is after it's done mucking about.
2789 MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2790 return true;
2791 }
2792 return false;
2793 }
2794
2795 // Find loads and stores that can be merged into a single load or store pair
2796 // instruction.
tryToPairLdStInst(MachineBasicBlock::iterator & MBBI)2797 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2798 MachineInstr &MI = *MBBI;
2799 MachineBasicBlock::iterator E = MI.getParent()->end();
2800
2801 if (!TII->isCandidateToMergeOrPair(MI))
2802 return false;
2803
2804 // If disable-ldp feature is opted, do not emit ldp.
2805 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2806 return false;
2807
2808 // If disable-stp feature is opted, do not emit stp.
2809 if (MI.mayStore() && Subtarget->hasDisableStp())
2810 return false;
2811
2812 // Early exit if the offset is not possible to match. (6 bits of positive
2813 // range, plus allow an extra one in case we find a later insn that matches
2814 // with Offset-1)
2815 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2816 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2817 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2818 // Allow one more for offset.
2819 if (Offset > 0)
2820 Offset -= OffsetStride;
2821 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2822 return false;
2823
2824 // Look ahead up to LdStLimit instructions for a pairable instruction.
2825 LdStPairFlags Flags;
2826 MachineBasicBlock::iterator Paired =
2827 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2828 if (Paired != E) {
2829 // Keeping the iterator straight is a pain, so we let the merge routine tell
2830 // us what the next instruction is after it's done mucking about.
2831 auto Prev = std::prev(MBBI);
2832
2833 // Fetch the memoperand of the load/store that is a candidate for
2834 // combination.
2835 MachineMemOperand *MemOp =
2836 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2837
2838 // If a load/store arrives and ldp/stp-aligned-only feature is opted, check
2839 // that the alignment of the source pointer is at least double the alignment
2840 // of the type.
2841 if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
2842 (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2843 // If there is no size/align information, cancel the transformation.
2844 if (!MemOp || !MemOp->getMemoryType().isValid()) {
2845 NumFailedAlignmentCheck++;
2846 return false;
2847 }
2848
2849 // Get the needed alignments to check them if
2850 // ldp-aligned-only/stp-aligned-only features are opted.
2851 uint64_t MemAlignment = MemOp->getAlign().value();
2852 uint64_t TypeAlignment =
2853 Align(MemOp->getSize().getValue().getKnownMinValue()).value();
2854
2855 if (MemAlignment < 2 * TypeAlignment) {
2856 NumFailedAlignmentCheck++;
2857 return false;
2858 }
2859 }
2860
2861 ++NumPairCreated;
2862 if (TII->hasUnscaledLdStOffset(MI))
2863 ++NumUnscaledPairCreated;
2864
2865 MBBI = mergePairedInsns(MBBI, Paired, Flags);
2866 // Collect liveness info for instructions between Prev and the new position
2867 // MBBI.
2868 for (auto I = std::next(Prev); I != MBBI; I++)
2869 updateDefinedRegisters(*I, DefinedInBB, TRI);
2870
2871 return true;
2872 }
2873 return false;
2874 }
2875
tryToMergeLdStUpdate(MachineBasicBlock::iterator & MBBI)2876 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2877 (MachineBasicBlock::iterator &MBBI) {
2878 MachineInstr &MI = *MBBI;
2879 MachineBasicBlock::iterator E = MI.getParent()->end();
2880 MachineBasicBlock::iterator Update;
2881
2882 // Look forward to try to form a post-index instruction. For example,
2883 // ldr x0, [x20]
2884 // add x20, x20, #32
2885 // merged into:
2886 // ldr x0, [x20], #32
2887 Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2888 if (Update != E) {
2889 // Merge the update into the ld/st.
2890 if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
2891 /*IsPreIdx=*/false,
2892 /*MergeEither=*/false)) {
2893 MBBI = *NextI;
2894 return true;
2895 }
2896 }
2897
2898 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2899 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2900 return false;
2901
2902 // Look back to try to find a pre-index instruction. For example,
2903 // add x0, x0, #8
2904 // ldr x1, [x0]
2905 // merged into:
2906 // ldr x1, [x0, #8]!
2907 bool MergeEither;
2908 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit, MergeEither);
2909 if (Update != E) {
2910 // Merge the update into the ld/st.
2911 if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/true,
2912 /*IsPreIdx=*/true, MergeEither)) {
2913 MBBI = *NextI;
2914 return true;
2915 }
2916 }
2917
2918 // The immediate in the load/store is scaled by the size of the memory
2919 // operation. The immediate in the add we're looking for,
2920 // however, is not, so adjust here.
2921 int UnscaledOffset =
2922 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2923
2924 // Look forward to try to find a pre-index instruction. For example,
2925 // ldr x1, [x0, #64]
2926 // add x0, x0, #64
2927 // merged into:
2928 // ldr x1, [x0, #64]!
2929 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2930 if (Update != E) {
2931 // Merge the update into the ld/st.
2932 if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
2933 /*IsPreIdx=*/true,
2934 /*MergeEither=*/false)) {
2935 MBBI = *NextI;
2936 return true;
2937 }
2938 }
2939
2940 return false;
2941 }
2942
tryToMergeIndexLdSt(MachineBasicBlock::iterator & MBBI,int Scale)2943 bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2944 int Scale) {
2945 MachineInstr &MI = *MBBI;
2946 MachineBasicBlock::iterator E = MI.getParent()->end();
2947 MachineBasicBlock::iterator Update;
2948
2949 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2950 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2951 return false;
2952
2953 // Look back to try to find a const offset for index LdSt instruction. For
2954 // example,
2955 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2956 // ldr x1, [x0, x8]
2957 // merged into:
2958 // add x8, x0, a * (1<<12)
2959 // ldr x1, [x8, imm12]
2960 unsigned Offset;
2961 Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
2962 if (Update != E && (Offset & (Scale - 1)) == 0) {
2963 // Merge the imm12 into the ld/st.
2964 MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
2965 return true;
2966 }
2967
2968 return false;
2969 }
2970
optimizeBlock(MachineBasicBlock & MBB,bool EnableNarrowZeroStOpt)2971 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2972 bool EnableNarrowZeroStOpt) {
2973 AArch64FunctionInfo &AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
2974
2975 bool Modified = false;
2976 // Four transformations to do here:
2977 // 1) Find loads that directly read from stores and promote them by
2978 // replacing with mov instructions. If the store is wider than the load,
2979 // the load will be replaced with a bitfield extract.
2980 // e.g.,
2981 // str w1, [x0, #4]
2982 // ldrh w2, [x0, #6]
2983 // ; becomes
2984 // str w1, [x0, #4]
2985 // lsr w2, w1, #16
2986 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2987 MBBI != E;) {
2988 if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2989 Modified = true;
2990 else
2991 ++MBBI;
2992 }
2993 // 2) Merge adjacent zero stores into a wider store.
2994 // e.g.,
2995 // strh wzr, [x0]
2996 // strh wzr, [x0, #2]
2997 // ; becomes
2998 // str wzr, [x0]
2999 // e.g.,
3000 // str wzr, [x0]
3001 // str wzr, [x0, #4]
3002 // ; becomes
3003 // str xzr, [x0]
3004 if (EnableNarrowZeroStOpt)
3005 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3006 MBBI != E;) {
3007 if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
3008 Modified = true;
3009 else
3010 ++MBBI;
3011 }
3012 // 3) Find loads and stores that can be merged into a single load or store
3013 // pair instruction.
3014 // When compiling for SVE 128, also try to combine SVE fill/spill
3015 // instructions into LDP/STP.
3016 // e.g.,
3017 // ldr x0, [x2]
3018 // ldr x1, [x2, #8]
3019 // ; becomes
3020 // ldp x0, x1, [x2]
3021 // e.g.,
3022 // ldr z0, [x2]
3023 // ldr z1, [x2, #1, mul vl]
3024 // ; becomes
3025 // ldp q0, q1, [x2]
3026
3027 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
3028 DefinedInBB.clear();
3029 DefinedInBB.addLiveIns(MBB);
3030 }
3031
3032 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3033 MBBI != E;) {
3034 // Track currently live registers up to this point, to help with
3035 // searching for a rename register on demand.
3036 updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
3037 if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
3038 Modified = true;
3039 else
3040 ++MBBI;
3041 }
3042 // 4) Find base register updates that can be merged into the load or store
3043 // as a base-reg writeback.
3044 // e.g.,
3045 // ldr x0, [x2]
3046 // add x2, x2, #4
3047 // ; becomes
3048 // ldr x0, [x2], #4
3049 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3050 MBBI != E;) {
3051 if (isMergeableLdStUpdate(*MBBI, AFI) && tryToMergeLdStUpdate(MBBI))
3052 Modified = true;
3053 else
3054 ++MBBI;
3055 }
3056
3057 // 5) Find a register assigned with a const value that can be combined with
3058 // into the load or store. e.g.,
3059 // mov x8, #LargeImm ; = a * (1<<12) + imm12
3060 // ldr x1, [x0, x8]
3061 // ; becomes
3062 // add x8, x0, a * (1<<12)
3063 // ldr x1, [x8, imm12]
3064 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
3065 MBBI != E;) {
3066 int Scale;
3067 if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
3068 Modified = true;
3069 else
3070 ++MBBI;
3071 }
3072
3073 return Modified;
3074 }
3075
runOnMachineFunction(MachineFunction & Fn)3076 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
3077 if (skipFunction(Fn.getFunction()))
3078 return false;
3079
3080 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
3081 TII = Subtarget->getInstrInfo();
3082 TRI = Subtarget->getRegisterInfo();
3083 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
3084
3085 // Resize the modified and used register unit trackers. We do this once
3086 // per function and then clear the register units each time we optimize a load
3087 // or store.
3088 ModifiedRegUnits.init(*TRI);
3089 UsedRegUnits.init(*TRI);
3090 DefinedInBB.init(*TRI);
3091
3092 bool Modified = false;
3093 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
3094 for (auto &MBB : Fn) {
3095 auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
3096 Modified |= M;
3097 }
3098
3099 return Modified;
3100 }
3101
// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
3106
3107 // FIXME: When pairing store instructions it's very possible for this pass to
3108 // hoist a store with a KILL marker above another use (without a KILL marker).
3109 // The resulting IR is invalid, but nothing uses the KILL markers after this
3110 // pass, so it's never caused a problem in practice.
3111
3112 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
3113 /// load / store optimization pass.
createAArch64LoadStoreOptimizationPass()3114 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
3115 return new AArch64LoadStoreOpt();
3116 }
3117