1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains a pass that performs load / store related peephole
10 // optimizations. This pass should be run after register allocation.
11 //
12 // The pass runs after the PrologEpilogInserter where we emit the CFI
13 // instructions. In order to preserve the correctness of the unwind informaiton,
14 // the pass should not change the order of any two instructions, one of which
15 // has the FrameSetup/FrameDestroy flag or, alternatively, apply an add-hoc fix
16 // to unwind information.
17 //
18 //===----------------------------------------------------------------------===//
19
20 #include "AArch64InstrInfo.h"
21 #include "AArch64MachineFunctionInfo.h"
22 #include "AArch64Subtarget.h"
23 #include "MCTargetDesc/AArch64AddressingModes.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/Statistic.h"
26 #include "llvm/ADT/StringRef.h"
27 #include "llvm/ADT/iterator_range.h"
28 #include "llvm/Analysis/AliasAnalysis.h"
29 #include "llvm/CodeGen/MachineBasicBlock.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/CodeGen/MachineFunctionPass.h"
32 #include "llvm/CodeGen/MachineInstr.h"
33 #include "llvm/CodeGen/MachineInstrBuilder.h"
34 #include "llvm/CodeGen/MachineOperand.h"
35 #include "llvm/CodeGen/MachineRegisterInfo.h"
36 #include "llvm/CodeGen/TargetRegisterInfo.h"
37 #include "llvm/IR/DebugLoc.h"
38 #include "llvm/MC/MCAsmInfo.h"
39 #include "llvm/MC/MCDwarf.h"
40 #include "llvm/MC/MCRegisterInfo.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/CommandLine.h"
43 #include "llvm/Support/Debug.h"
44 #include "llvm/Support/DebugCounter.h"
45 #include "llvm/Support/ErrorHandling.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include <cassert>
48 #include <cstdint>
49 #include <functional>
50 #include <iterator>
51 #include <limits>
52 #include <optional>
53
54 using namespace llvm;
55
56 #define DEBUG_TYPE "aarch64-ldst-opt"
57
58 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
59 STATISTIC(NumPostFolded, "Number of post-index updates folded");
60 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
61 STATISTIC(NumUnscaledPairCreated,
62 "Number of load/store from unscaled generated");
63 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
64 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
65 STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformation "
66 "not passed the alignment check");
67
68 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
69 "Controls which pairs are considered for renaming");
70
71 // The LdStLimit limits how far we search for load/store pairs.
72 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
73 cl::init(20), cl::Hidden);
74
75 // The UpdateLimit limits how far we search for update instructions when we form
76 // pre-/post-index instructions.
77 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
78 cl::Hidden);
79
80 // Enable register renaming to find additional store pairing opportunities.
81 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
82 cl::init(true), cl::Hidden);
83
84 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
85
86 namespace {
87
88 using LdStPairFlags = struct LdStPairFlags {
89 // If a matching instruction is found, MergeForward is set to true if the
90 // merge is to remove the first instruction and replace the second with
91 // a pair-wise insn, and false if the reverse is true.
92 bool MergeForward = false;
93
94 // SExtIdx gives the index of the result of the load pair that must be
95 // extended. The value of SExtIdx assumes that the paired load produces the
96 // value in this order: (I, returned iterator), i.e., -1 means no value has
97 // to be extended, 0 means I, and 1 means the returned iterator.
98 int SExtIdx = -1;
99
100 // If not none, RenameReg can be used to rename the result register of the
101 // first store in a pair. Currently this only works when merging stores
102 // forward.
103 std::optional<MCPhysReg> RenameReg;
104
105 LdStPairFlags() = default;
106
107 void setMergeForward(bool V = true) { MergeForward = V; }
108 bool getMergeForward() const { return MergeForward; }
109
110 void setSExtIdx(int V) { SExtIdx = V; }
111 int getSExtIdx() const { return SExtIdx; }
112
113 void setRenameReg(MCPhysReg R) { RenameReg = R; }
114 void clearRenameReg() { RenameReg = std::nullopt; }
115 std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
116 };
117
118 struct AArch64LoadStoreOpt : public MachineFunctionPass {
119 static char ID;
120
AArch64LoadStoreOpt__anon22729c170111::AArch64LoadStoreOpt121 AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
122 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
123 }
124
125 AliasAnalysis *AA;
126 const AArch64InstrInfo *TII;
127 const TargetRegisterInfo *TRI;
128 const AArch64Subtarget *Subtarget;
129
130 // Track which register units have been modified and used.
131 LiveRegUnits ModifiedRegUnits, UsedRegUnits;
132 LiveRegUnits DefinedInBB;
133
getAnalysisUsage__anon22729c170111::AArch64LoadStoreOpt134 void getAnalysisUsage(AnalysisUsage &AU) const override {
135 AU.addRequired<AAResultsWrapperPass>();
136 MachineFunctionPass::getAnalysisUsage(AU);
137 }
138
139 // Scan the instructions looking for a load/store that can be combined
140 // with the current instruction into a load/store pair.
141 // Return the matching instruction if one is found, else MBB->end().
142 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
143 LdStPairFlags &Flags,
144 unsigned Limit,
145 bool FindNarrowMerge);
146
147 // Scan the instructions looking for a store that writes to the address from
148 // which the current load instruction reads. Return true if one is found.
149 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
150 MachineBasicBlock::iterator &StoreI);
151
152 // Merge the two instructions indicated into a wider narrow store instruction.
153 MachineBasicBlock::iterator
154 mergeNarrowZeroStores(MachineBasicBlock::iterator I,
155 MachineBasicBlock::iterator MergeMI,
156 const LdStPairFlags &Flags);
157
158 // Merge the two instructions indicated into a single pair-wise instruction.
159 MachineBasicBlock::iterator
160 mergePairedInsns(MachineBasicBlock::iterator I,
161 MachineBasicBlock::iterator Paired,
162 const LdStPairFlags &Flags);
163
164 // Promote the load that reads directly from the address stored to.
165 MachineBasicBlock::iterator
166 promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
167 MachineBasicBlock::iterator StoreI);
168
169 // Scan the instruction list to find a base register update that can
170 // be combined with the current instruction (a load or store) using
171 // pre or post indexed addressing with writeback. Scan forwards.
172 MachineBasicBlock::iterator
173 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
174 int UnscaledOffset, unsigned Limit);
175
176 // Scan the instruction list to find a base register update that can
177 // be combined with the current instruction (a load or store) using
178 // pre or post indexed addressing with writeback. Scan backwards.
179 MachineBasicBlock::iterator
180 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
181
182 // Find an instruction that updates the base register of the ld/st
183 // instruction.
184 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
185 unsigned BaseReg, int Offset);
186
187 // Merge a pre- or post-index base register update into a ld/st instruction.
188 MachineBasicBlock::iterator
189 mergeUpdateInsn(MachineBasicBlock::iterator I,
190 MachineBasicBlock::iterator Update, bool IsPreIdx);
191
192 // Find and merge zero store instructions.
193 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
194
195 // Find and pair ldr/str instructions.
196 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
197
198 // Find and promote load instructions which read directly from store.
199 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
200
201 // Find and merge a base register updates before or after a ld/st instruction.
202 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
203
204 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
205
206 bool runOnMachineFunction(MachineFunction &Fn) override;
207
getRequiredProperties__anon22729c170111::AArch64LoadStoreOpt208 MachineFunctionProperties getRequiredProperties() const override {
209 return MachineFunctionProperties().set(
210 MachineFunctionProperties::Property::NoVRegs);
211 }
212
getPassName__anon22729c170111::AArch64LoadStoreOpt213 StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
214 };
215
216 char AArch64LoadStoreOpt::ID = 0;
217
218 } // end anonymous namespace
219
220 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
221 AARCH64_LOAD_STORE_OPT_NAME, false, false)
222
isNarrowStore(unsigned Opc)223 static bool isNarrowStore(unsigned Opc) {
224 switch (Opc) {
225 default:
226 return false;
227 case AArch64::STRBBui:
228 case AArch64::STURBBi:
229 case AArch64::STRHHui:
230 case AArch64::STURHHi:
231 return true;
232 }
233 }
234
235 // These instruction set memory tag and either keep memory contents unchanged or
236 // set it to zero, ignoring the address part of the source register.
isTagStore(const MachineInstr & MI)237 static bool isTagStore(const MachineInstr &MI) {
238 switch (MI.getOpcode()) {
239 default:
240 return false;
241 case AArch64::STGi:
242 case AArch64::STZGi:
243 case AArch64::ST2Gi:
244 case AArch64::STZ2Gi:
245 return true;
246 }
247 }
248
getMatchingNonSExtOpcode(unsigned Opc,bool * IsValidLdStrOpc=nullptr)249 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
250 bool *IsValidLdStrOpc = nullptr) {
251 if (IsValidLdStrOpc)
252 *IsValidLdStrOpc = true;
253 switch (Opc) {
254 default:
255 if (IsValidLdStrOpc)
256 *IsValidLdStrOpc = false;
257 return std::numeric_limits<unsigned>::max();
258 case AArch64::STRDui:
259 case AArch64::STURDi:
260 case AArch64::STRDpre:
261 case AArch64::STRQui:
262 case AArch64::STURQi:
263 case AArch64::STRQpre:
264 case AArch64::STRBBui:
265 case AArch64::STURBBi:
266 case AArch64::STRHHui:
267 case AArch64::STURHHi:
268 case AArch64::STRWui:
269 case AArch64::STRWpre:
270 case AArch64::STURWi:
271 case AArch64::STRXui:
272 case AArch64::STRXpre:
273 case AArch64::STURXi:
274 case AArch64::LDRDui:
275 case AArch64::LDURDi:
276 case AArch64::LDRDpre:
277 case AArch64::LDRQui:
278 case AArch64::LDURQi:
279 case AArch64::LDRQpre:
280 case AArch64::LDRWui:
281 case AArch64::LDURWi:
282 case AArch64::LDRWpre:
283 case AArch64::LDRXui:
284 case AArch64::LDURXi:
285 case AArch64::LDRXpre:
286 case AArch64::STRSui:
287 case AArch64::STURSi:
288 case AArch64::STRSpre:
289 case AArch64::LDRSui:
290 case AArch64::LDURSi:
291 case AArch64::LDRSpre:
292 return Opc;
293 case AArch64::LDRSWui:
294 return AArch64::LDRWui;
295 case AArch64::LDURSWi:
296 return AArch64::LDURWi;
297 case AArch64::LDRSWpre:
298 return AArch64::LDRWpre;
299 }
300 }
301
getMatchingWideOpcode(unsigned Opc)302 static unsigned getMatchingWideOpcode(unsigned Opc) {
303 switch (Opc) {
304 default:
305 llvm_unreachable("Opcode has no wide equivalent!");
306 case AArch64::STRBBui:
307 return AArch64::STRHHui;
308 case AArch64::STRHHui:
309 return AArch64::STRWui;
310 case AArch64::STURBBi:
311 return AArch64::STURHHi;
312 case AArch64::STURHHi:
313 return AArch64::STURWi;
314 case AArch64::STURWi:
315 return AArch64::STURXi;
316 case AArch64::STRWui:
317 return AArch64::STRXui;
318 }
319 }
320
getMatchingPairOpcode(unsigned Opc)321 static unsigned getMatchingPairOpcode(unsigned Opc) {
322 switch (Opc) {
323 default:
324 llvm_unreachable("Opcode has no pairwise equivalent!");
325 case AArch64::STRSui:
326 case AArch64::STURSi:
327 return AArch64::STPSi;
328 case AArch64::STRSpre:
329 return AArch64::STPSpre;
330 case AArch64::STRDui:
331 case AArch64::STURDi:
332 return AArch64::STPDi;
333 case AArch64::STRDpre:
334 return AArch64::STPDpre;
335 case AArch64::STRQui:
336 case AArch64::STURQi:
337 return AArch64::STPQi;
338 case AArch64::STRQpre:
339 return AArch64::STPQpre;
340 case AArch64::STRWui:
341 case AArch64::STURWi:
342 return AArch64::STPWi;
343 case AArch64::STRWpre:
344 return AArch64::STPWpre;
345 case AArch64::STRXui:
346 case AArch64::STURXi:
347 return AArch64::STPXi;
348 case AArch64::STRXpre:
349 return AArch64::STPXpre;
350 case AArch64::LDRSui:
351 case AArch64::LDURSi:
352 return AArch64::LDPSi;
353 case AArch64::LDRSpre:
354 return AArch64::LDPSpre;
355 case AArch64::LDRDui:
356 case AArch64::LDURDi:
357 return AArch64::LDPDi;
358 case AArch64::LDRDpre:
359 return AArch64::LDPDpre;
360 case AArch64::LDRQui:
361 case AArch64::LDURQi:
362 return AArch64::LDPQi;
363 case AArch64::LDRQpre:
364 return AArch64::LDPQpre;
365 case AArch64::LDRWui:
366 case AArch64::LDURWi:
367 return AArch64::LDPWi;
368 case AArch64::LDRWpre:
369 return AArch64::LDPWpre;
370 case AArch64::LDRXui:
371 case AArch64::LDURXi:
372 return AArch64::LDPXi;
373 case AArch64::LDRXpre:
374 return AArch64::LDPXpre;
375 case AArch64::LDRSWui:
376 case AArch64::LDURSWi:
377 return AArch64::LDPSWi;
378 case AArch64::LDRSWpre:
379 return AArch64::LDPSWpre;
380 }
381 }
382
isMatchingStore(MachineInstr & LoadInst,MachineInstr & StoreInst)383 static unsigned isMatchingStore(MachineInstr &LoadInst,
384 MachineInstr &StoreInst) {
385 unsigned LdOpc = LoadInst.getOpcode();
386 unsigned StOpc = StoreInst.getOpcode();
387 switch (LdOpc) {
388 default:
389 llvm_unreachable("Unsupported load instruction!");
390 case AArch64::LDRBBui:
391 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
392 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
393 case AArch64::LDURBBi:
394 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
395 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
396 case AArch64::LDRHHui:
397 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
398 StOpc == AArch64::STRXui;
399 case AArch64::LDURHHi:
400 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
401 StOpc == AArch64::STURXi;
402 case AArch64::LDRWui:
403 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
404 case AArch64::LDURWi:
405 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
406 case AArch64::LDRXui:
407 return StOpc == AArch64::STRXui;
408 case AArch64::LDURXi:
409 return StOpc == AArch64::STURXi;
410 }
411 }
412
getPreIndexedOpcode(unsigned Opc)413 static unsigned getPreIndexedOpcode(unsigned Opc) {
414 // FIXME: We don't currently support creating pre-indexed loads/stores when
415 // the load or store is the unscaled version. If we decide to perform such an
416 // optimization in the future the cases for the unscaled loads/stores will
417 // need to be added here.
418 switch (Opc) {
419 default:
420 llvm_unreachable("Opcode has no pre-indexed equivalent!");
421 case AArch64::STRSui:
422 return AArch64::STRSpre;
423 case AArch64::STRDui:
424 return AArch64::STRDpre;
425 case AArch64::STRQui:
426 return AArch64::STRQpre;
427 case AArch64::STRBBui:
428 return AArch64::STRBBpre;
429 case AArch64::STRHHui:
430 return AArch64::STRHHpre;
431 case AArch64::STRWui:
432 return AArch64::STRWpre;
433 case AArch64::STRXui:
434 return AArch64::STRXpre;
435 case AArch64::LDRSui:
436 return AArch64::LDRSpre;
437 case AArch64::LDRDui:
438 return AArch64::LDRDpre;
439 case AArch64::LDRQui:
440 return AArch64::LDRQpre;
441 case AArch64::LDRBBui:
442 return AArch64::LDRBBpre;
443 case AArch64::LDRHHui:
444 return AArch64::LDRHHpre;
445 case AArch64::LDRWui:
446 return AArch64::LDRWpre;
447 case AArch64::LDRXui:
448 return AArch64::LDRXpre;
449 case AArch64::LDRSWui:
450 return AArch64::LDRSWpre;
451 case AArch64::LDPSi:
452 return AArch64::LDPSpre;
453 case AArch64::LDPSWi:
454 return AArch64::LDPSWpre;
455 case AArch64::LDPDi:
456 return AArch64::LDPDpre;
457 case AArch64::LDPQi:
458 return AArch64::LDPQpre;
459 case AArch64::LDPWi:
460 return AArch64::LDPWpre;
461 case AArch64::LDPXi:
462 return AArch64::LDPXpre;
463 case AArch64::STPSi:
464 return AArch64::STPSpre;
465 case AArch64::STPDi:
466 return AArch64::STPDpre;
467 case AArch64::STPQi:
468 return AArch64::STPQpre;
469 case AArch64::STPWi:
470 return AArch64::STPWpre;
471 case AArch64::STPXi:
472 return AArch64::STPXpre;
473 case AArch64::STGi:
474 return AArch64::STGPreIndex;
475 case AArch64::STZGi:
476 return AArch64::STZGPreIndex;
477 case AArch64::ST2Gi:
478 return AArch64::ST2GPreIndex;
479 case AArch64::STZ2Gi:
480 return AArch64::STZ2GPreIndex;
481 case AArch64::STGPi:
482 return AArch64::STGPpre;
483 }
484 }
485
getPostIndexedOpcode(unsigned Opc)486 static unsigned getPostIndexedOpcode(unsigned Opc) {
487 switch (Opc) {
488 default:
489 llvm_unreachable("Opcode has no post-indexed wise equivalent!");
490 case AArch64::STRSui:
491 case AArch64::STURSi:
492 return AArch64::STRSpost;
493 case AArch64::STRDui:
494 case AArch64::STURDi:
495 return AArch64::STRDpost;
496 case AArch64::STRQui:
497 case AArch64::STURQi:
498 return AArch64::STRQpost;
499 case AArch64::STRBBui:
500 return AArch64::STRBBpost;
501 case AArch64::STRHHui:
502 return AArch64::STRHHpost;
503 case AArch64::STRWui:
504 case AArch64::STURWi:
505 return AArch64::STRWpost;
506 case AArch64::STRXui:
507 case AArch64::STURXi:
508 return AArch64::STRXpost;
509 case AArch64::LDRSui:
510 case AArch64::LDURSi:
511 return AArch64::LDRSpost;
512 case AArch64::LDRDui:
513 case AArch64::LDURDi:
514 return AArch64::LDRDpost;
515 case AArch64::LDRQui:
516 case AArch64::LDURQi:
517 return AArch64::LDRQpost;
518 case AArch64::LDRBBui:
519 return AArch64::LDRBBpost;
520 case AArch64::LDRHHui:
521 return AArch64::LDRHHpost;
522 case AArch64::LDRWui:
523 case AArch64::LDURWi:
524 return AArch64::LDRWpost;
525 case AArch64::LDRXui:
526 case AArch64::LDURXi:
527 return AArch64::LDRXpost;
528 case AArch64::LDRSWui:
529 return AArch64::LDRSWpost;
530 case AArch64::LDPSi:
531 return AArch64::LDPSpost;
532 case AArch64::LDPSWi:
533 return AArch64::LDPSWpost;
534 case AArch64::LDPDi:
535 return AArch64::LDPDpost;
536 case AArch64::LDPQi:
537 return AArch64::LDPQpost;
538 case AArch64::LDPWi:
539 return AArch64::LDPWpost;
540 case AArch64::LDPXi:
541 return AArch64::LDPXpost;
542 case AArch64::STPSi:
543 return AArch64::STPSpost;
544 case AArch64::STPDi:
545 return AArch64::STPDpost;
546 case AArch64::STPQi:
547 return AArch64::STPQpost;
548 case AArch64::STPWi:
549 return AArch64::STPWpost;
550 case AArch64::STPXi:
551 return AArch64::STPXpost;
552 case AArch64::STGi:
553 return AArch64::STGPostIndex;
554 case AArch64::STZGi:
555 return AArch64::STZGPostIndex;
556 case AArch64::ST2Gi:
557 return AArch64::ST2GPostIndex;
558 case AArch64::STZ2Gi:
559 return AArch64::STZ2GPostIndex;
560 case AArch64::STGPi:
561 return AArch64::STGPpost;
562 }
563 }
564
isPreLdStPairCandidate(MachineInstr & FirstMI,MachineInstr & MI)565 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
566
567 unsigned OpcA = FirstMI.getOpcode();
568 unsigned OpcB = MI.getOpcode();
569
570 switch (OpcA) {
571 default:
572 return false;
573 case AArch64::STRSpre:
574 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
575 case AArch64::STRDpre:
576 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
577 case AArch64::STRQpre:
578 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
579 case AArch64::STRWpre:
580 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
581 case AArch64::STRXpre:
582 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
583 case AArch64::LDRSpre:
584 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
585 case AArch64::LDRDpre:
586 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
587 case AArch64::LDRQpre:
588 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
589 case AArch64::LDRWpre:
590 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
591 case AArch64::LDRXpre:
592 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
593 case AArch64::LDRSWpre:
594 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
595 }
596 }
597
598 // Returns the scale and offset range of pre/post indexed variants of MI.
getPrePostIndexedMemOpInfo(const MachineInstr & MI,int & Scale,int & MinOffset,int & MaxOffset)599 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
600 int &MinOffset, int &MaxOffset) {
601 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
602 bool IsTagStore = isTagStore(MI);
603 // ST*G and all paired ldst have the same scale in pre/post-indexed variants
604 // as in the "unsigned offset" variant.
605 // All other pre/post indexed ldst instructions are unscaled.
606 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
607
608 if (IsPaired) {
609 MinOffset = -64;
610 MaxOffset = 63;
611 } else {
612 MinOffset = -256;
613 MaxOffset = 255;
614 }
615 }
616
getLdStRegOp(MachineInstr & MI,unsigned PairedRegOp=0)617 static MachineOperand &getLdStRegOp(MachineInstr &MI,
618 unsigned PairedRegOp = 0) {
619 assert(PairedRegOp < 2 && "Unexpected register operand idx.");
620 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
621 if (IsPreLdSt)
622 PairedRegOp += 1;
623 unsigned Idx =
624 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
625 return MI.getOperand(Idx);
626 }
627
isLdOffsetInRangeOfSt(MachineInstr & LoadInst,MachineInstr & StoreInst,const AArch64InstrInfo * TII)628 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
629 MachineInstr &StoreInst,
630 const AArch64InstrInfo *TII) {
631 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
632 int LoadSize = TII->getMemScale(LoadInst);
633 int StoreSize = TII->getMemScale(StoreInst);
634 int UnscaledStOffset =
635 TII->hasUnscaledLdStOffset(StoreInst)
636 ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
637 : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
638 int UnscaledLdOffset =
639 TII->hasUnscaledLdStOffset(LoadInst)
640 ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
641 : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
642 return (UnscaledStOffset <= UnscaledLdOffset) &&
643 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
644 }
645
isPromotableZeroStoreInst(MachineInstr & MI)646 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
647 unsigned Opc = MI.getOpcode();
648 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
649 isNarrowStore(Opc)) &&
650 getLdStRegOp(MI).getReg() == AArch64::WZR;
651 }
652
isPromotableLoadFromStore(MachineInstr & MI)653 static bool isPromotableLoadFromStore(MachineInstr &MI) {
654 switch (MI.getOpcode()) {
655 default:
656 return false;
657 // Scaled instructions.
658 case AArch64::LDRBBui:
659 case AArch64::LDRHHui:
660 case AArch64::LDRWui:
661 case AArch64::LDRXui:
662 // Unscaled instructions.
663 case AArch64::LDURBBi:
664 case AArch64::LDURHHi:
665 case AArch64::LDURWi:
666 case AArch64::LDURXi:
667 return true;
668 }
669 }
670
isMergeableLdStUpdate(MachineInstr & MI)671 static bool isMergeableLdStUpdate(MachineInstr &MI) {
672 unsigned Opc = MI.getOpcode();
673 switch (Opc) {
674 default:
675 return false;
676 // Scaled instructions.
677 case AArch64::STRSui:
678 case AArch64::STRDui:
679 case AArch64::STRQui:
680 case AArch64::STRXui:
681 case AArch64::STRWui:
682 case AArch64::STRHHui:
683 case AArch64::STRBBui:
684 case AArch64::LDRSui:
685 case AArch64::LDRDui:
686 case AArch64::LDRQui:
687 case AArch64::LDRXui:
688 case AArch64::LDRWui:
689 case AArch64::LDRHHui:
690 case AArch64::LDRBBui:
691 case AArch64::STGi:
692 case AArch64::STZGi:
693 case AArch64::ST2Gi:
694 case AArch64::STZ2Gi:
695 case AArch64::STGPi:
696 // Unscaled instructions.
697 case AArch64::STURSi:
698 case AArch64::STURDi:
699 case AArch64::STURQi:
700 case AArch64::STURWi:
701 case AArch64::STURXi:
702 case AArch64::LDURSi:
703 case AArch64::LDURDi:
704 case AArch64::LDURQi:
705 case AArch64::LDURWi:
706 case AArch64::LDURXi:
707 // Paired instructions.
708 case AArch64::LDPSi:
709 case AArch64::LDPSWi:
710 case AArch64::LDPDi:
711 case AArch64::LDPQi:
712 case AArch64::LDPWi:
713 case AArch64::LDPXi:
714 case AArch64::STPSi:
715 case AArch64::STPDi:
716 case AArch64::STPQi:
717 case AArch64::STPWi:
718 case AArch64::STPXi:
719 // Make sure this is a reg+imm (as opposed to an address reloc).
720 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
721 return false;
722
723 return true;
724 }
725 }
726
isRewritableImplicitDef(unsigned Opc)727 static bool isRewritableImplicitDef(unsigned Opc) {
728 switch (Opc) {
729 default:
730 return false;
731 case AArch64::ORRWrs:
732 case AArch64::ADDWri:
733 return true;
734 }
735 }
736
737 MachineBasicBlock::iterator
mergeNarrowZeroStores(MachineBasicBlock::iterator I,MachineBasicBlock::iterator MergeMI,const LdStPairFlags & Flags)738 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
739 MachineBasicBlock::iterator MergeMI,
740 const LdStPairFlags &Flags) {
741 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
742 "Expected promotable zero stores.");
743
744 MachineBasicBlock::iterator E = I->getParent()->end();
745 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
746 // If NextI is the second of the two instructions to be merged, we need
747 // to skip one further. Either way we merge will invalidate the iterator,
748 // and we don't need to scan the new instruction, as it's a pairwise
749 // instruction, which we're not considering for further action anyway.
750 if (NextI == MergeMI)
751 NextI = next_nodbg(NextI, E);
752
753 unsigned Opc = I->getOpcode();
754 unsigned MergeMIOpc = MergeMI->getOpcode();
755 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
756 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
757 int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
758 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;
759
760 bool MergeForward = Flags.getMergeForward();
761 // Insert our new paired instruction after whichever of the paired
762 // instructions MergeForward indicates.
763 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
764 // Also based on MergeForward is from where we copy the base register operand
765 // so we get the flags compatible with the input code.
766 const MachineOperand &BaseRegOp =
767 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
768 : AArch64InstrInfo::getLdStBaseOp(*I);
769
770 // Which register is Rt and which is Rt2 depends on the offset order.
771 int64_t IOffsetInBytes =
772 AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
773 int64_t MIOffsetInBytes =
774 AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
775 MergeMIOffsetStride;
776 // Select final offset based on the offset order.
777 int64_t OffsetImm;
778 if (IOffsetInBytes > MIOffsetInBytes)
779 OffsetImm = MIOffsetInBytes;
780 else
781 OffsetImm = IOffsetInBytes;
782
783 int NewOpcode = getMatchingWideOpcode(Opc);
784 bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);
785
786 // Adjust final offset if the result opcode is a scaled store.
787 if (FinalIsScaled) {
788 int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;
789 assert(((OffsetImm % NewOffsetStride) == 0) &&
790 "Offset should be a multiple of the store memory scale");
791 OffsetImm = OffsetImm / NewOffsetStride;
792 }
793
794 // Construct the new instruction.
795 DebugLoc DL = I->getDebugLoc();
796 MachineBasicBlock *MBB = I->getParent();
797 MachineInstrBuilder MIB;
798 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
799 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
800 .add(BaseRegOp)
801 .addImm(OffsetImm)
802 .cloneMergedMemRefs({&*I, &*MergeMI})
803 .setMIFlags(I->mergeFlagsWith(*MergeMI));
804 (void)MIB;
805
806 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n ");
807 LLVM_DEBUG(I->print(dbgs()));
808 LLVM_DEBUG(dbgs() << " ");
809 LLVM_DEBUG(MergeMI->print(dbgs()));
810 LLVM_DEBUG(dbgs() << " with instruction:\n ");
811 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
812 LLVM_DEBUG(dbgs() << "\n");
813
814 // Erase the old instructions.
815 I->eraseFromParent();
816 MergeMI->eraseFromParent();
817 return NextI;
818 }
819
820 // Apply Fn to all instructions between MI and the beginning of the block, until
821 // a def for DefReg is reached. Returns true, iff Fn returns true for all
822 // visited instructions. Stop after visiting Limit iterations.
forAllMIsUntilDef(MachineInstr & MI,MCPhysReg DefReg,const TargetRegisterInfo * TRI,unsigned Limit,std::function<bool (MachineInstr &,bool)> & Fn)823 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
824 const TargetRegisterInfo *TRI, unsigned Limit,
825 std::function<bool(MachineInstr &, bool)> &Fn) {
826 auto MBB = MI.getParent();
827 for (MachineInstr &I :
828 instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
829 if (!Limit)
830 return false;
831 --Limit;
832
833 bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
834 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
835 TRI->regsOverlap(MOP.getReg(), DefReg);
836 });
837 if (!Fn(I, isDef))
838 return false;
839 if (isDef)
840 break;
841 }
842 return true;
843 }
844
updateDefinedRegisters(MachineInstr & MI,LiveRegUnits & Units,const TargetRegisterInfo * TRI)845 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
846 const TargetRegisterInfo *TRI) {
847
848 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
849 if (MOP.isReg() && MOP.isKill())
850 Units.removeReg(MOP.getReg());
851
852 for (const MachineOperand &MOP : phys_regs_and_masks(MI))
853 if (MOP.isReg() && !MOP.isKill())
854 Units.addReg(MOP.getReg());
855 }
856
857 MachineBasicBlock::iterator
mergePairedInsns(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Paired,const LdStPairFlags & Flags)858 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
859 MachineBasicBlock::iterator Paired,
860 const LdStPairFlags &Flags) {
861 MachineBasicBlock::iterator E = I->getParent()->end();
862 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
863 // If NextI is the second of the two instructions to be merged, we need
864 // to skip one further. Either way we merge will invalidate the iterator,
865 // and we don't need to scan the new instruction, as it's a pairwise
866 // instruction, which we're not considering for further action anyway.
867 if (NextI == Paired)
868 NextI = next_nodbg(NextI, E);
869
870 int SExtIdx = Flags.getSExtIdx();
871 unsigned Opc =
872 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
873 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
874 int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
875
876 bool MergeForward = Flags.getMergeForward();
877
878 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
879 if (RenameReg) {
880 MCRegister RegToRename = getLdStRegOp(*I).getReg();
881 DefinedInBB.addReg(*RenameReg);
882
883 // Return the sub/super register for RenameReg, matching the size of
884 // OriginalReg.
885 auto GetMatchingSubReg =
886 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
887 for (MCPhysReg SubOrSuper :
888 TRI->sub_and_superregs_inclusive(*RenameReg)) {
889 if (C->contains(SubOrSuper))
890 return SubOrSuper;
891 }
892 llvm_unreachable("Should have found matching sub or super register!");
893 };
894
895 std::function<bool(MachineInstr &, bool)> UpdateMIs =
896 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
897 bool IsDef) {
898 if (IsDef) {
899 bool SeenDef = false;
900 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
901 MachineOperand &MOP = MI.getOperand(OpIdx);
902 // Rename the first explicit definition and all implicit
903 // definitions matching RegToRename.
904 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
905 (!MergeForward || !SeenDef ||
906 (MOP.isDef() && MOP.isImplicit())) &&
907 TRI->regsOverlap(MOP.getReg(), RegToRename)) {
908 assert((MOP.isImplicit() ||
909 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
910 "Need renamable operands");
911 Register MatchingReg;
912 if (const TargetRegisterClass *RC =
913 MI.getRegClassConstraint(OpIdx, TII, TRI))
914 MatchingReg = GetMatchingSubReg(RC);
915 else {
916 if (!isRewritableImplicitDef(MI.getOpcode()))
917 continue;
918 MatchingReg = GetMatchingSubReg(
919 TRI->getMinimalPhysRegClass(MOP.getReg()));
920 }
921 MOP.setReg(MatchingReg);
922 SeenDef = true;
923 }
924 }
925 } else {
926 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
927 MachineOperand &MOP = MI.getOperand(OpIdx);
928 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
929 TRI->regsOverlap(MOP.getReg(), RegToRename)) {
930 assert((MOP.isImplicit() ||
931 (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
932 "Need renamable operands");
933 Register MatchingReg;
934 if (const TargetRegisterClass *RC =
935 MI.getRegClassConstraint(OpIdx, TII, TRI))
936 MatchingReg = GetMatchingSubReg(RC);
937 else
938 MatchingReg = GetMatchingSubReg(
939 TRI->getMinimalPhysRegClass(MOP.getReg()));
940 assert(MatchingReg != AArch64::NoRegister &&
941 "Cannot find matching regs for renaming");
942 MOP.setReg(MatchingReg);
943 }
944 }
945 }
946 LLVM_DEBUG(dbgs() << "Renamed " << MI);
947 return true;
948 };
949 forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI,
950 UINT32_MAX, UpdateMIs);
951
952 #if !defined(NDEBUG)
953 // For forward merging store:
954 // Make sure the register used for renaming is not used between the
955 // paired instructions. That would trash the content before the new
956 // paired instruction.
957 MCPhysReg RegToCheck = *RenameReg;
958 // For backward merging load:
959 // Make sure the register being renamed is not used between the
960 // paired instructions. That would trash the content after the new
961 // paired instruction.
962 if (!MergeForward)
963 RegToCheck = RegToRename;
964 for (auto &MI :
965 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
966 MergeForward ? std::next(I) : I,
967 MergeForward ? std::next(Paired) : Paired))
968 assert(all_of(MI.operands(),
969 [this, RegToCheck](const MachineOperand &MOP) {
970 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
971 MOP.isUndef() ||
972 !TRI->regsOverlap(MOP.getReg(), RegToCheck);
973 }) &&
974 "Rename register used between paired instruction, trashing the "
975 "content");
976 #endif
977 }
978
979 // Insert our new paired instruction after whichever of the paired
980 // instructions MergeForward indicates.
981 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
982 // Also based on MergeForward is from where we copy the base register operand
983 // so we get the flags compatible with the input code.
984 const MachineOperand &BaseRegOp =
985 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
986 : AArch64InstrInfo::getLdStBaseOp(*I);
987
988 int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
989 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
990 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
991 if (IsUnscaled != PairedIsUnscaled) {
992 // We're trying to pair instructions that differ in how they are scaled. If
993 // I is scaled then scale the offset of Paired accordingly. Otherwise, do
994 // the opposite (i.e., make Paired's offset unscaled).
995 int MemSize = TII->getMemScale(*Paired);
996 if (PairedIsUnscaled) {
997 // If the unscaled offset isn't a multiple of the MemSize, we can't
998 // pair the operations together.
999 assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
1000 "Offset should be a multiple of the stride!");
1001 PairedOffset /= MemSize;
1002 } else {
1003 PairedOffset *= MemSize;
1004 }
1005 }
1006
1007 // Which register is Rt and which is Rt2 depends on the offset order.
1008 // However, for pre load/stores the Rt should be the one of the pre
1009 // load/store.
1010 MachineInstr *RtMI, *Rt2MI;
1011 if (Offset == PairedOffset + OffsetStride &&
1012 !AArch64InstrInfo::isPreLdSt(*I)) {
1013 RtMI = &*Paired;
1014 Rt2MI = &*I;
1015 // Here we swapped the assumption made for SExtIdx.
1016 // I.e., we turn ldp I, Paired into ldp Paired, I.
1017 // Update the index accordingly.
1018 if (SExtIdx != -1)
1019 SExtIdx = (SExtIdx + 1) % 2;
1020 } else {
1021 RtMI = &*I;
1022 Rt2MI = &*Paired;
1023 }
1024 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
1025 // Scale the immediate offset, if necessary.
1026 if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
1027 assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
1028 "Unscaled offset cannot be scaled.");
1029 OffsetImm /= TII->getMemScale(*RtMI);
1030 }
1031
1032 // Construct the new instruction.
1033 MachineInstrBuilder MIB;
1034 DebugLoc DL = I->getDebugLoc();
1035 MachineBasicBlock *MBB = I->getParent();
1036 MachineOperand RegOp0 = getLdStRegOp(*RtMI);
1037 MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
1038 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
1039 // Kill flags may become invalid when moving stores for pairing.
1040 if (RegOp0.isUse()) {
1041 if (!MergeForward) {
1042 // Clear kill flags on store if moving upwards. Example:
1043 // STRWui kill %w0, ...
1044 // USE %w1
1045 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards
1046 // We are about to move the store of w1, so its kill flag may become
1047 // invalid; not the case for w0.
1048 // Since w1 is used between the stores, the kill flag on w1 is cleared
1049 // after merging.
1050 // STPWi kill %w0, %w1, ...
1051 // USE %w1
1052 for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)
1053 if (It->readsRegister(PairedRegOp.getReg(), TRI))
1054 PairedRegOp.setIsKill(false);
1055 } else {
1056 // Clear kill flags of the first stores register. Example:
1057 // STRWui %w1, ...
1058 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards
1059 // STRW %w0
1060 Register Reg = getLdStRegOp(*I).getReg();
1061 for (MachineInstr &MI : make_range(std::next(I), Paired))
1062 MI.clearRegisterKills(Reg, TRI);
1063 }
1064 }
1065
1066 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
1067 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));
1068
1069 // Adds the pre-index operand for pre-indexed ld/st pairs.
1070 if (AArch64InstrInfo::isPreLdSt(*RtMI))
1071 MIB.addReg(BaseRegOp.getReg(), RegState::Define);
1072
1073 MIB.add(RegOp0)
1074 .add(RegOp1)
1075 .add(BaseRegOp)
1076 .addImm(OffsetImm)
1077 .cloneMergedMemRefs({&*I, &*Paired})
1078 .setMIFlags(I->mergeFlagsWith(*Paired));
1079
1080 (void)MIB;
1081
1082 LLVM_DEBUG(
1083 dbgs() << "Creating pair load/store. Replacing instructions:\n ");
1084 LLVM_DEBUG(I->print(dbgs()));
1085 LLVM_DEBUG(dbgs() << " ");
1086 LLVM_DEBUG(Paired->print(dbgs()));
1087 LLVM_DEBUG(dbgs() << " with instruction:\n ");
1088 if (SExtIdx != -1) {
1089 // Generate the sign extension for the proper result of the ldp.
1090 // I.e., with X1, that would be:
1091 // %w1 = KILL %w1, implicit-def %x1
1092 // %x1 = SBFMXri killed %x1, 0, 31
1093 MachineOperand &DstMO = MIB->getOperand(SExtIdx);
1094 // Right now, DstMO has the extended register, since it comes from an
1095 // extended opcode.
1096 Register DstRegX = DstMO.getReg();
1097 // Get the W variant of that register.
1098 Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
1099 // Update the result of LDP to use the W instead of the X variant.
1100 DstMO.setReg(DstRegW);
1101 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1102 LLVM_DEBUG(dbgs() << "\n");
1103 // Make the machine verifier happy by providing a definition for
1104 // the X register.
1105 // Insert this definition right after the generated LDP, i.e., before
1106 // InsertionPoint.
1107 MachineInstrBuilder MIBKill =
1108 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
1109 .addReg(DstRegW)
1110 .addReg(DstRegX, RegState::Define);
1111 MIBKill->getOperand(2).setImplicit();
1112 // Create the sign extension.
1113 MachineInstrBuilder MIBSXTW =
1114 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
1115 .addReg(DstRegX)
1116 .addImm(0)
1117 .addImm(31);
1118 (void)MIBSXTW;
1119 LLVM_DEBUG(dbgs() << " Extend operand:\n ");
1120 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
1121 } else {
1122 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
1123 }
1124 LLVM_DEBUG(dbgs() << "\n");
1125
1126 if (MergeForward)
1127 for (const MachineOperand &MOP : phys_regs_and_masks(*I))
1128 if (MOP.isReg() && MOP.isKill())
1129 DefinedInBB.addReg(MOP.getReg());
1130
1131 // Erase the old instructions.
1132 I->eraseFromParent();
1133 Paired->eraseFromParent();
1134
1135 return NextI;
1136 }
1137
1138 MachineBasicBlock::iterator
promoteLoadFromStore(MachineBasicBlock::iterator LoadI,MachineBasicBlock::iterator StoreI)1139 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
1140 MachineBasicBlock::iterator StoreI) {
1141 MachineBasicBlock::iterator NextI =
1142 next_nodbg(LoadI, LoadI->getParent()->end());
1143
1144 int LoadSize = TII->getMemScale(*LoadI);
1145 int StoreSize = TII->getMemScale(*StoreI);
1146 Register LdRt = getLdStRegOp(*LoadI).getReg();
1147 const MachineOperand &StMO = getLdStRegOp(*StoreI);
1148 Register StRt = getLdStRegOp(*StoreI).getReg();
1149 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
1150
1151 assert((IsStoreXReg ||
1152 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
1153 "Unexpected RegClass");
1154
1155 MachineInstr *BitExtMI;
1156 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
1157 // Remove the load, if the destination register of the loads is the same
1158 // register for stored value.
1159 if (StRt == LdRt && LoadSize == 8) {
1160 for (MachineInstr &MI : make_range(StoreI->getIterator(),
1161 LoadI->getIterator())) {
1162 if (MI.killsRegister(StRt, TRI)) {
1163 MI.clearRegisterKills(StRt, TRI);
1164 break;
1165 }
1166 }
1167 LLVM_DEBUG(dbgs() << "Remove load instruction:\n ");
1168 LLVM_DEBUG(LoadI->print(dbgs()));
1169 LLVM_DEBUG(dbgs() << "\n");
1170 LoadI->eraseFromParent();
1171 return NextI;
1172 }
1173 // Replace the load with a mov if the load and store are in the same size.
1174 BitExtMI =
1175 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1176 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
1177 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
1178 .add(StMO)
1179 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
1180 .setMIFlags(LoadI->getFlags());
1181 } else {
1182 // FIXME: Currently we disable this transformation in big-endian targets as
1183 // performance and correctness are verified only in little-endian.
1184 if (!Subtarget->isLittleEndian())
1185 return NextI;
1186 bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
1187 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
1188 "Unsupported ld/st match");
1189 assert(LoadSize <= StoreSize && "Invalid load size");
1190 int UnscaledLdOffset =
1191 IsUnscaled
1192 ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
1193 : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
1194 int UnscaledStOffset =
1195 IsUnscaled
1196 ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
1197 : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
1198 int Width = LoadSize * 8;
1199 Register DestReg =
1200 IsStoreXReg ? Register(TRI->getMatchingSuperReg(
1201 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
1202 : LdRt;
1203
1204 assert((UnscaledLdOffset >= UnscaledStOffset &&
1205 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
1206 "Invalid offset");
1207
1208 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
1209 int Imms = Immr + Width - 1;
1210 if (UnscaledLdOffset == UnscaledStOffset) {
1211 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
1212 | ((Immr) << 6) // immr
1213 | ((Imms) << 0) // imms
1214 ;
1215
1216 BitExtMI =
1217 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1218 TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
1219 DestReg)
1220 .add(StMO)
1221 .addImm(AndMaskEncoded)
1222 .setMIFlags(LoadI->getFlags());
1223 } else {
1224 BitExtMI =
1225 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
1226 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
1227 DestReg)
1228 .add(StMO)
1229 .addImm(Immr)
1230 .addImm(Imms)
1231 .setMIFlags(LoadI->getFlags());
1232 }
1233 }
1234
1235 // Clear kill flags between store and load.
1236 for (MachineInstr &MI : make_range(StoreI->getIterator(),
1237 BitExtMI->getIterator()))
1238 if (MI.killsRegister(StRt, TRI)) {
1239 MI.clearRegisterKills(StRt, TRI);
1240 break;
1241 }
1242
1243 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n ");
1244 LLVM_DEBUG(StoreI->print(dbgs()));
1245 LLVM_DEBUG(dbgs() << " ");
1246 LLVM_DEBUG(LoadI->print(dbgs()));
1247 LLVM_DEBUG(dbgs() << " with instructions:\n ");
1248 LLVM_DEBUG(StoreI->print(dbgs()));
1249 LLVM_DEBUG(dbgs() << " ");
1250 LLVM_DEBUG((BitExtMI)->print(dbgs()));
1251 LLVM_DEBUG(dbgs() << "\n");
1252
1253 // Erase the old instructions.
1254 LoadI->eraseFromParent();
1255 return NextI;
1256 }
1257
inBoundsForPair(bool IsUnscaled,int Offset,int OffsetStride)1258 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
1259 // Convert the byte-offset used by unscaled into an "element" offset used
1260 // by the scaled pair load/store instructions.
1261 if (IsUnscaled) {
1262 // If the byte-offset isn't a multiple of the stride, there's no point
1263 // trying to match it.
1264 if (Offset % OffsetStride)
1265 return false;
1266 Offset /= OffsetStride;
1267 }
1268 return Offset <= 63 && Offset >= -64;
1269 }
1270
1271 // Do alignment, specialized to power of 2 and for signed ints,
1272 // avoiding having to do a C-style cast from uint_64t to int when
1273 // using alignTo from include/llvm/Support/MathExtras.h.
1274 // FIXME: Move this function to include/MathExtras.h?
alignTo(int Num,int PowOf2)1275 static int alignTo(int Num, int PowOf2) {
1276 return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
1277 }
1278
mayAlias(MachineInstr & MIa,SmallVectorImpl<MachineInstr * > & MemInsns,AliasAnalysis * AA)1279 static bool mayAlias(MachineInstr &MIa,
1280 SmallVectorImpl<MachineInstr *> &MemInsns,
1281 AliasAnalysis *AA) {
1282 for (MachineInstr *MIb : MemInsns) {
1283 if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
1284 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
1285 return true;
1286 }
1287 }
1288
1289 LLVM_DEBUG(dbgs() << "No aliases found\n");
1290 return false;
1291 }
1292
findMatchingStore(MachineBasicBlock::iterator I,unsigned Limit,MachineBasicBlock::iterator & StoreI)1293 bool AArch64LoadStoreOpt::findMatchingStore(
1294 MachineBasicBlock::iterator I, unsigned Limit,
1295 MachineBasicBlock::iterator &StoreI) {
1296 MachineBasicBlock::iterator B = I->getParent()->begin();
1297 MachineBasicBlock::iterator MBBI = I;
1298 MachineInstr &LoadMI = *I;
1299 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();
1300
1301 // If the load is the first instruction in the block, there's obviously
1302 // not any matching store.
1303 if (MBBI == B)
1304 return false;
1305
1306 // Track which register units have been modified and used between the first
1307 // insn and the second insn.
1308 ModifiedRegUnits.clear();
1309 UsedRegUnits.clear();
1310
1311 unsigned Count = 0;
1312 do {
1313 MBBI = prev_nodbg(MBBI, B);
1314 MachineInstr &MI = *MBBI;
1315
1316 // Don't count transient instructions towards the search limit since there
1317 // may be different numbers of them if e.g. debug information is present.
1318 if (!MI.isTransient())
1319 ++Count;
1320
1321 // If the load instruction reads directly from the address to which the
1322 // store instruction writes and the stored value is not modified, we can
1323 // promote the load. Since we do not handle stores with pre-/post-index,
1324 // it's unnecessary to check if BaseReg is modified by the store itself.
1325 // Also we can't handle stores without an immediate offset operand,
1326 // while the operand might be the address for a global variable.
1327 if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
1328 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
1329 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
1330 isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
1331 ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
1332 StoreI = MBBI;
1333 return true;
1334 }
1335
1336 if (MI.isCall())
1337 return false;
1338
1339 // Update modified / uses register units.
1340 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1341
1342 // Otherwise, if the base register is modified, we have no match, so
1343 // return early.
1344 if (!ModifiedRegUnits.available(BaseReg))
1345 return false;
1346
1347 // If we encounter a store aliased with the load, return early.
1348 if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
1349 return false;
1350 } while (MBBI != B && Count < Limit);
1351 return false;
1352 }
1353
needsWinCFI(const MachineFunction * MF)1354 static bool needsWinCFI(const MachineFunction *MF) {
1355 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1356 MF->getFunction().needsUnwindTableEntry();
1357 }
1358
1359 // Returns true if FirstMI and MI are candidates for merging or pairing.
1360 // Otherwise, returns false.
areCandidatesToMergeOrPair(MachineInstr & FirstMI,MachineInstr & MI,LdStPairFlags & Flags,const AArch64InstrInfo * TII)1361 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
1362 LdStPairFlags &Flags,
1363 const AArch64InstrInfo *TII) {
1364 // If this is volatile or if pairing is suppressed, not a candidate.
1365 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
1366 return false;
1367
1368 // We should have already checked FirstMI for pair suppression and volatility.
1369 assert(!FirstMI.hasOrderedMemoryRef() &&
1370 !TII->isLdStPairSuppressed(FirstMI) &&
1371 "FirstMI shouldn't get here if either of these checks are true.");
1372
1373 if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
1374 MI.getFlag(MachineInstr::FrameDestroy)))
1375 return false;
1376
1377 unsigned OpcA = FirstMI.getOpcode();
1378 unsigned OpcB = MI.getOpcode();
1379
1380 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check.
1381 if (OpcA == OpcB)
1382 return !AArch64InstrInfo::isPreLdSt(FirstMI);
1383
1384 // Two pre ld/st of different opcodes cannot be merged either
1385 if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
1386 return false;
1387
1388 // Try to match a sign-extended load/store with a zero-extended load/store.
1389 bool IsValidLdStrOpc, PairIsValidLdStrOpc;
1390 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
1391 assert(IsValidLdStrOpc &&
1392 "Given Opc should be a Load or Store with an immediate");
1393 // OpcA will be the first instruction in the pair.
1394 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
1395 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
1396 return true;
1397 }
1398
1399 // If the second instruction isn't even a mergable/pairable load/store, bail
1400 // out.
1401 if (!PairIsValidLdStrOpc)
1402 return false;
1403
1404 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
1405 // offsets.
1406 if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
1407 return false;
1408
1409 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
1410 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui
1411 // are candidate pairs that can be merged.
1412 if (isPreLdStPairCandidate(FirstMI, MI))
1413 return true;
1414
1415 // Try to match an unscaled load/store with a scaled load/store.
1416 return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
1417 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);
1418
1419 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
1420 }
1421
canRenameMOP(const MachineOperand & MOP,const TargetRegisterInfo * TRI)1422 static bool canRenameMOP(const MachineOperand &MOP,
1423 const TargetRegisterInfo *TRI) {
1424 if (MOP.isReg()) {
1425 auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
1426 // Renaming registers with multiple disjunct sub-registers (e.g. the
1427 // result of a LD3) means that all sub-registers are renamed, potentially
1428 // impacting other instructions we did not check. Bail out.
1429 // Note that this relies on the structure of the AArch64 register file. In
1430 // particular, a subregister cannot be written without overwriting the
1431 // whole register.
1432 if (RegClass->HasDisjunctSubRegs) {
1433 LLVM_DEBUG(
1434 dbgs()
1435 << " Cannot rename operands with multiple disjunct subregisters ("
1436 << MOP << ")\n");
1437 return false;
1438 }
1439
1440 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
1441 // them must be known. For example, in ORRWrs the implicit-def
1442 // corresponds to the result register.
1443 if (MOP.isImplicit() && MOP.isDef()) {
1444 if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
1445 return false;
1446 return TRI->isSuperOrSubRegisterEq(
1447 MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
1448 }
1449 }
1450 return MOP.isImplicit() ||
1451 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
1452 }
1453
1454 static bool
canRenameUpToDef(MachineInstr & FirstMI,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1455 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
1456 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1457 const TargetRegisterInfo *TRI) {
1458 if (!FirstMI.mayStore())
1459 return false;
1460
1461 // Check if we can find an unused register which we can use to rename
1462 // the register used by the first load/store.
1463
1464 auto RegToRename = getLdStRegOp(FirstMI).getReg();
1465 // For now, we only rename if the store operand gets killed at the store.
1466 if (!getLdStRegOp(FirstMI).isKill() &&
1467 !any_of(FirstMI.operands(),
1468 [TRI, RegToRename](const MachineOperand &MOP) {
1469 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
1470 MOP.isImplicit() && MOP.isKill() &&
1471 TRI->regsOverlap(RegToRename, MOP.getReg());
1472 })) {
1473 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI);
1474 return false;
1475 }
1476
1477 bool FoundDef = false;
1478
1479 // For each instruction between FirstMI and the previous def for RegToRename,
1480 // we
1481 // * check if we can rename RegToRename in this instruction
1482 // * collect the registers used and required register classes for RegToRename.
1483 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
1484 bool IsDef) {
1485 LLVM_DEBUG(dbgs() << "Checking " << MI);
1486 // Currently we do not try to rename across frame-setup instructions.
1487 if (MI.getFlag(MachineInstr::FrameSetup)) {
1488 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1489 << "currently\n");
1490 return false;
1491 }
1492
1493 UsedInBetween.accumulate(MI);
1494
1495 // For a definition, check that we can rename the definition and exit the
1496 // loop.
1497 FoundDef = IsDef;
1498
1499 // For defs, check if we can rename the first def of RegToRename.
1500 if (FoundDef) {
1501 // For some pseudo instructions, we might not generate code in the end
1502 // (e.g. KILL) and we would end up without a correct def for the rename
1503 // register.
1504 // TODO: This might be overly conservative and we could handle those cases
1505 // in multiple ways:
1506 // 1. Insert an extra copy, to materialize the def.
1507 // 2. Skip pseudo-defs until we find an non-pseudo def.
1508 if (MI.isPseudo()) {
1509 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n");
1510 return false;
1511 }
1512
1513 for (auto &MOP : MI.operands()) {
1514 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
1515 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1516 continue;
1517 if (!canRenameMOP(MOP, TRI)) {
1518 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1519 return false;
1520 }
1521 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1522 }
1523 return true;
1524 } else {
1525 for (auto &MOP : MI.operands()) {
1526 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1527 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1528 continue;
1529
1530 if (!canRenameMOP(MOP, TRI)) {
1531 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1532 return false;
1533 }
1534 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1535 }
1536 }
1537 return true;
1538 };
1539
1540 if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
1541 return false;
1542
1543 if (!FoundDef) {
1544 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
1545 return false;
1546 }
1547 return true;
1548 }
1549
1550 // We want to merge the second load into the first by rewriting the usages of
1551 // the same reg between first (incl.) and second (excl.). We don't need to care
1552 // about any insns before FirstLoad or after SecondLoad.
1553 // 1. The second load writes new value into the same reg.
1554 // - The renaming is impossible to impact later use of the reg.
1555 // - The second load always trash the value written by the first load which
1556 // means the reg must be killed before the second load.
1557 // 2. The first load must be a def for the same reg so we don't need to look
1558 // into anything before it.
canRenameUntilSecondLoad(MachineInstr & FirstLoad,MachineInstr & SecondLoad,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1559 static bool canRenameUntilSecondLoad(
1560 MachineInstr &FirstLoad, MachineInstr &SecondLoad,
1561 LiveRegUnits &UsedInBetween,
1562 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1563 const TargetRegisterInfo *TRI) {
1564 if (FirstLoad.isPseudo())
1565 return false;
1566
1567 UsedInBetween.accumulate(FirstLoad);
1568 auto RegToRename = getLdStRegOp(FirstLoad).getReg();
1569 bool Success = std::all_of(
1570 FirstLoad.getIterator(), SecondLoad.getIterator(),
1571 [&](MachineInstr &MI) {
1572 LLVM_DEBUG(dbgs() << "Checking " << MI);
1573 // Currently we do not try to rename across frame-setup instructions.
1574 if (MI.getFlag(MachineInstr::FrameSetup)) {
1575 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions "
1576 << "currently\n");
1577 return false;
1578 }
1579
1580 for (auto &MOP : MI.operands()) {
1581 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
1582 !TRI->regsOverlap(MOP.getReg(), RegToRename))
1583 continue;
1584 if (!canRenameMOP(MOP, TRI)) {
1585 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI);
1586 return false;
1587 }
1588 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
1589 }
1590
1591 return true;
1592 });
1593 return Success;
1594 }
1595
1596 // Check if we can find a physical register for renaming \p Reg. This register
1597 // must:
1598 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all
1599 // defined registers up to the point where the renamed register will be used,
1600 // * not used in \p UsedInBetween; UsedInBetween must contain all accessed
1601 // registers in the range the rename register will be used,
1602 // * is available in all used register classes (checked using RequiredClasses).
tryToFindRegisterToRename(const MachineFunction & MF,Register Reg,LiveRegUnits & DefinedInBB,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1603 static std::optional<MCPhysReg> tryToFindRegisterToRename(
1604 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
1605 LiveRegUnits &UsedInBetween,
1606 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1607 const TargetRegisterInfo *TRI) {
1608 const MachineRegisterInfo &RegInfo = MF.getRegInfo();
1609
1610 // Checks if any sub- or super-register of PR is callee saved.
1611 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
1612 return any_of(TRI->sub_and_superregs_inclusive(PR),
1613 [&MF, TRI](MCPhysReg SubOrSuper) {
1614 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
1615 });
1616 };
1617
1618 // Check if PR or one of its sub- or super-registers can be used for all
1619 // required register classes.
1620 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
1621 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
1622 return any_of(
1623 TRI->sub_and_superregs_inclusive(PR),
1624 [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
1625 });
1626 };
1627
1628 auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
1629 for (const MCPhysReg &PR : *RegClass) {
1630 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
1631 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
1632 CanBeUsedForAllClasses(PR)) {
1633 DefinedInBB.addReg(PR);
1634 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
1635 << "\n");
1636 return {PR};
1637 }
1638 }
1639 LLVM_DEBUG(dbgs() << "No rename register found from "
1640 << TRI->getRegClassName(RegClass) << "\n");
1641 return std::nullopt;
1642 }
1643
1644 // For store pairs: returns a register from FirstMI to the beginning of the
1645 // block that can be renamed.
1646 // For load pairs: returns a register from FirstMI to MI that can be renamed.
findRenameRegForSameLdStRegPair(std::optional<bool> MaybeCanRename,MachineInstr & FirstMI,MachineInstr & MI,Register Reg,LiveRegUnits & DefinedInBB,LiveRegUnits & UsedInBetween,SmallPtrSetImpl<const TargetRegisterClass * > & RequiredClasses,const TargetRegisterInfo * TRI)1647 static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
1648 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
1649 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1650 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1651 const TargetRegisterInfo *TRI) {
1652 std::optional<MCPhysReg> RenameReg;
1653 if (!DebugCounter::shouldExecute(RegRenamingCounter))
1654 return RenameReg;
1655
1656 auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
1657 MachineFunction &MF = *FirstMI.getParent()->getParent();
1658 if (!RegClass || !MF.getRegInfo().tracksLiveness())
1659 return RenameReg;
1660
1661 const bool IsLoad = FirstMI.mayLoad();
1662
1663 if (!MaybeCanRename) {
1664 if (IsLoad)
1665 MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
1666 RequiredClasses, TRI)};
1667 else
1668 MaybeCanRename = {
1669 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1670 }
1671
1672 if (*MaybeCanRename) {
1673 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
1674 RequiredClasses, TRI);
1675 }
1676 return RenameReg;
1677 }
1678
1679 /// Scan the instructions looking for a load/store that can be combined with the
1680 /// current instruction into a wider equivalent or a load/store pair.
1681 MachineBasicBlock::iterator
findMatchingInsn(MachineBasicBlock::iterator I,LdStPairFlags & Flags,unsigned Limit,bool FindNarrowMerge)1682 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
1683 LdStPairFlags &Flags, unsigned Limit,
1684 bool FindNarrowMerge) {
1685 MachineBasicBlock::iterator E = I->getParent()->end();
1686 MachineBasicBlock::iterator MBBI = I;
1687 MachineBasicBlock::iterator MBBIWithRenameReg;
1688 MachineInstr &FirstMI = *I;
1689 MBBI = next_nodbg(MBBI, E);
1690
1691 bool MayLoad = FirstMI.mayLoad();
1692 bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
1693 Register Reg = getLdStRegOp(FirstMI).getReg();
1694 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
1695 int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
1696 int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
1697 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
1698
1699 std::optional<bool> MaybeCanRename;
1700 if (!EnableRenaming)
1701 MaybeCanRename = {false};
1702
1703 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
1704 LiveRegUnits UsedInBetween;
1705 UsedInBetween.init(*TRI);
1706
1707 Flags.clearRenameReg();
1708
1709 // Track which register units have been modified and used between the first
1710 // insn (inclusive) and the second insn.
1711 ModifiedRegUnits.clear();
1712 UsedRegUnits.clear();
1713
1714 // Remember any instructions that read/write memory between FirstMI and MI.
1715 SmallVector<MachineInstr *, 4> MemInsns;
1716
1717 LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
1718 for (unsigned Count = 0; MBBI != E && Count < Limit;
1719 MBBI = next_nodbg(MBBI, E)) {
1720 MachineInstr &MI = *MBBI;
1721 LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());
1722
1723 UsedInBetween.accumulate(MI);
1724
1725 // Don't count transient instructions towards the search limit since there
1726 // may be different numbers of them if e.g. debug information is present.
1727 if (!MI.isTransient())
1728 ++Count;
1729
1730 Flags.setSExtIdx(-1);
1731 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
1732 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
1733 assert(MI.mayLoadOrStore() && "Expected memory operation.");
1734 // If we've found another instruction with the same opcode, check to see
1735 // if the base and offset are compatible with our starting instruction.
1736 // These instructions all have scaled immediate operands, so we just
1737 // check for +1/-1. Make sure to check the new instruction offset is
1738 // actually an immediate and not a symbolic reference destined for
1739 // a relocation.
1740 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
1741 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
1742 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
1743 if (IsUnscaled != MIIsUnscaled) {
1744 // We're trying to pair instructions that differ in how they are scaled.
1745 // If FirstMI is scaled then scale the offset of MI accordingly.
1746 // Otherwise, do the opposite (i.e., make MI's offset unscaled).
1747 int MemSize = TII->getMemScale(MI);
1748 if (MIIsUnscaled) {
1749 // If the unscaled offset isn't a multiple of the MemSize, we can't
1750 // pair the operations together: bail and keep looking.
1751 if (MIOffset % MemSize) {
1752 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1753 UsedRegUnits, TRI);
1754 MemInsns.push_back(&MI);
1755 continue;
1756 }
1757 MIOffset /= MemSize;
1758 } else {
1759 MIOffset *= MemSize;
1760 }
1761 }
1762
1763 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);
1764
1765 if (BaseReg == MIBaseReg) {
1766 // If the offset of the second ld/st is not equal to the size of the
1767 // destination register it can’t be paired with a pre-index ld/st
1768 // pair. Additionally if the base reg is used or modified the operations
1769 // can't be paired: bail and keep looking.
1770 if (IsPreLdSt) {
1771 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
1772 bool IsBaseRegUsed = !UsedRegUnits.available(
1773 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1774 bool IsBaseRegModified = !ModifiedRegUnits.available(
1775 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1776 // If the stored value and the address of the second instruction is
1777 // the same, it needs to be using the updated register and therefore
1778 // it must not be folded.
1779 bool IsMIRegTheSame =
1780 TRI->regsOverlap(getLdStRegOp(MI).getReg(),
1781 AArch64InstrInfo::getLdStBaseOp(MI).getReg());
1782 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
1783 IsMIRegTheSame) {
1784 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1785 UsedRegUnits, TRI);
1786 MemInsns.push_back(&MI);
1787 continue;
1788 }
1789 } else {
1790 if ((Offset != MIOffset + OffsetStride) &&
1791 (Offset + OffsetStride != MIOffset)) {
1792 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1793 UsedRegUnits, TRI);
1794 MemInsns.push_back(&MI);
1795 continue;
1796 }
1797 }
1798
1799 int MinOffset = Offset < MIOffset ? Offset : MIOffset;
1800 if (FindNarrowMerge) {
1801 // If the alignment requirements of the scaled wide load/store
1802 // instruction can't express the offset of the scaled narrow input,
1803 // bail and keep looking. For promotable zero stores, allow only when
1804 // the stored value is the same (i.e., WZR).
1805 if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
1806 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
1807 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1808 UsedRegUnits, TRI);
1809 MemInsns.push_back(&MI);
1810 continue;
1811 }
1812 } else {
1813 // Pairwise instructions have a 7-bit signed offset field. Single
1814 // insns have a 12-bit unsigned offset field. If the resultant
1815 // immediate offset of merging these instructions is out of range for
1816 // a pairwise instruction, bail and keep looking.
1817 if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1818 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1819 UsedRegUnits, TRI);
1820 MemInsns.push_back(&MI);
1821 LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
1822 << "keep looking.\n");
1823 continue;
1824 }
1825 // If the alignment requirements of the paired (scaled) instruction
1826 // can't express the offset of the unscaled input, bail and keep
1827 // looking.
1828 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1829 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1830 UsedRegUnits, TRI);
1831 MemInsns.push_back(&MI);
1832 LLVM_DEBUG(dbgs()
1833 << "Offset doesn't fit due to alignment requirements, "
1834 << "keep looking.\n");
1835 continue;
1836 }
1837 }
1838
1839 // If the BaseReg has been modified, then we cannot do the optimization.
1840 // For example, in the following pattern
1841 // ldr x1 [x2]
1842 // ldr x2 [x3]
1843 // ldr x4 [x2, #8],
1844 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1845 if (!ModifiedRegUnits.available(BaseReg))
1846 return E;
1847
1848 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
1849 Reg, getLdStRegOp(MI).getReg());
1850
1851 // If the Rt of the second instruction (destination register of the
1852 // load) was not modified or used between the two instructions and none
1853 // of the instructions between the second and first alias with the
1854 // second, we can combine the second into the first.
1855 bool RtNotModified =
1856 ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
1857 bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
1858 !UsedRegUnits.available(getLdStRegOp(MI).getReg()));
1859
1860 LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
1861 << "Reg '" << getLdStRegOp(MI) << "' not modified: "
1862 << (RtNotModified ? "true" : "false") << "\n"
1863 << "Reg '" << getLdStRegOp(MI) << "' not used: "
1864 << (RtNotUsed ? "true" : "false") << "\n");
1865
1866 if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
1867 // For pairs loading into the same reg, try to find a renaming
1868 // opportunity to allow the renaming of Reg between FirstMI and MI
1869 // and combine MI into FirstMI; otherwise bail and keep looking.
1870 if (SameLoadReg) {
1871 std::optional<MCPhysReg> RenameReg =
1872 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
1873 Reg, DefinedInBB, UsedInBetween,
1874 RequiredClasses, TRI);
1875 if (!RenameReg) {
1876 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1877 UsedRegUnits, TRI);
1878 MemInsns.push_back(&MI);
1879 LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
1880 << "keep looking.\n");
1881 continue;
1882 }
1883 Flags.setRenameReg(*RenameReg);
1884 }
1885
1886 Flags.setMergeForward(false);
1887 if (!SameLoadReg)
1888 Flags.clearRenameReg();
1889 return MBBI;
1890 }
1891
1892 // Likewise, if the Rt of the first instruction is not modified or used
1893 // between the two instructions and none of the instructions between the
1894 // first and the second alias with the first, we can combine the first
1895 // into the second.
1896 RtNotModified = !(
1897 MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));
1898
1899 LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
1900 << "Reg '" << getLdStRegOp(FirstMI)
1901 << "' not modified: "
1902 << (RtNotModified ? "true" : "false") << "\n");
1903
1904 if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
1905 if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1906 Flags.setMergeForward(true);
1907 Flags.clearRenameReg();
1908 return MBBI;
1909 }
1910
1911 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
1912 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
1913 RequiredClasses, TRI);
1914 if (RenameReg) {
1915 Flags.setMergeForward(true);
1916 Flags.setRenameReg(*RenameReg);
1917 MBBIWithRenameReg = MBBI;
1918 }
1919 }
1920 LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
1921 << "interference in between, keep looking.\n");
1922 }
1923 }
1924
1925 if (Flags.getRenameReg())
1926 return MBBIWithRenameReg;
1927
1928 // If the instruction wasn't a matching load or store. Stop searching if we
1929 // encounter a call instruction that might modify memory.
1930 if (MI.isCall()) {
1931 LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
1932 return E;
1933 }
1934
1935 // Update modified / uses register units.
1936 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1937
1938 // Otherwise, if the base register is modified, we have no match, so
1939 // return early.
1940 if (!ModifiedRegUnits.available(BaseReg)) {
1941 LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
1942 return E;
1943 }
1944
1945 // Update list of instructions that read/write memory.
1946 if (MI.mayLoadOrStore())
1947 MemInsns.push_back(&MI);
1948 }
1949 return E;
1950 }
1951
1952 static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr & MI,MachineBasicBlock::iterator MaybeCFI)1953 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1954 auto End = MI.getParent()->end();
1955 if (MaybeCFI == End ||
1956 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1957 !(MI.getFlag(MachineInstr::FrameSetup) ||
1958 MI.getFlag(MachineInstr::FrameDestroy)) ||
1959 AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
1960 return End;
1961
1962 const MachineFunction &MF = *MI.getParent()->getParent();
1963 unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1964 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1965 switch (CFI.getOperation()) {
1966 case MCCFIInstruction::OpDefCfa:
1967 case MCCFIInstruction::OpDefCfaOffset:
1968 return MaybeCFI;
1969 default:
1970 return End;
1971 }
1972 }
1973
1974 MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Update,bool IsPreIdx)1975 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1976 MachineBasicBlock::iterator Update,
1977 bool IsPreIdx) {
1978 assert((Update->getOpcode() == AArch64::ADDXri ||
1979 Update->getOpcode() == AArch64::SUBXri) &&
1980 "Unexpected base register update instruction to merge!");
1981 MachineBasicBlock::iterator E = I->getParent()->end();
1982 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
1983
1984 // If updating the SP and the following instruction is CFA offset related CFI
1985 // instruction move it after the merged instruction.
1986 MachineBasicBlock::iterator CFI =
1987 IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
1988
1989 // Return the instruction following the merged instruction, which is
1990 // the instruction following our unmerged load. Unless that's the add/sub
1991 // instruction we're merging, in which case it's the one after that.
1992 if (NextI == Update)
1993 NextI = next_nodbg(NextI, E);
1994
1995 int Value = Update->getOperand(2).getImm();
1996 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
1997 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
1998 if (Update->getOpcode() == AArch64::SUBXri)
1999 Value = -Value;
2000
2001 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
2002 : getPostIndexedOpcode(I->getOpcode());
2003 MachineInstrBuilder MIB;
2004 int Scale, MinOffset, MaxOffset;
2005 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
2006 if (!AArch64InstrInfo::isPairedLdSt(*I)) {
2007 // Non-paired instruction.
2008 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2009 .add(getLdStRegOp(*Update))
2010 .add(getLdStRegOp(*I))
2011 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2012 .addImm(Value / Scale)
2013 .setMemRefs(I->memoperands())
2014 .setMIFlags(I->mergeFlagsWith(*Update));
2015 } else {
2016 // Paired instruction.
2017 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2018 .add(getLdStRegOp(*Update))
2019 .add(getLdStRegOp(*I, 0))
2020 .add(getLdStRegOp(*I, 1))
2021 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2022 .addImm(Value / Scale)
2023 .setMemRefs(I->memoperands())
2024 .setMIFlags(I->mergeFlagsWith(*Update));
2025 }
2026 if (CFI != E) {
2027 MachineBasicBlock *MBB = I->getParent();
2028 MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
2029 }
2030
2031 if (IsPreIdx) {
2032 ++NumPreFolded;
2033 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
2034 } else {
2035 ++NumPostFolded;
2036 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
2037 }
2038 LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
2039 LLVM_DEBUG(I->print(dbgs()));
2040 LLVM_DEBUG(dbgs() << " ");
2041 LLVM_DEBUG(Update->print(dbgs()));
2042 LLVM_DEBUG(dbgs() << " with instruction:\n ");
2043 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
2044 LLVM_DEBUG(dbgs() << "\n");
2045
2046 // Erase the old instructions for the block.
2047 I->eraseFromParent();
2048 Update->eraseFromParent();
2049
2050 return NextI;
2051 }
2052
isMatchingUpdateInsn(MachineInstr & MemMI,MachineInstr & MI,unsigned BaseReg,int Offset)2053 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
2054 MachineInstr &MI,
2055 unsigned BaseReg, int Offset) {
2056 switch (MI.getOpcode()) {
2057 default:
2058 break;
2059 case AArch64::SUBXri:
2060 case AArch64::ADDXri:
2061 // Make sure it's a vanilla immediate operand, not a relocation or
2062 // anything else we can't handle.
2063 if (!MI.getOperand(2).isImm())
2064 break;
2065 // Watch out for 1 << 12 shifted value.
2066 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
2067 break;
2068
2069 // The update instruction source and destination register must be the
2070 // same as the load/store base register.
2071 if (MI.getOperand(0).getReg() != BaseReg ||
2072 MI.getOperand(1).getReg() != BaseReg)
2073 break;
2074
2075 int UpdateOffset = MI.getOperand(2).getImm();
2076 if (MI.getOpcode() == AArch64::SUBXri)
2077 UpdateOffset = -UpdateOffset;
2078
2079 // The immediate must be a multiple of the scaling factor of the pre/post
2080 // indexed instruction.
2081 int Scale, MinOffset, MaxOffset;
2082 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
2083 if (UpdateOffset % Scale != 0)
2084 break;
2085
2086 // Scaled offset must fit in the instruction immediate.
2087 int ScaledOffset = UpdateOffset / Scale;
2088 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
2089 break;
2090
2091 // If we have a non-zero Offset, we check that it matches the amount
2092 // we're adding to the register.
2093 if (!Offset || Offset == UpdateOffset)
2094 return true;
2095 break;
2096 }
2097 return false;
2098 }
2099
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,int UnscaledOffset,unsigned Limit)2100 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
2101 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
2102 MachineBasicBlock::iterator E = I->getParent()->end();
2103 MachineInstr &MemMI = *I;
2104 MachineBasicBlock::iterator MBBI = I;
2105
2106 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2107 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
2108 TII->getMemScale(MemMI);
2109
2110 // Scan forward looking for post-index opportunities. Updating instructions
2111 // can't be formed if the memory instruction doesn't have the offset we're
2112 // looking for.
2113 if (MIUnscaledOffset != UnscaledOffset)
2114 return E;
2115
2116 // If the base register overlaps a source/destination register, we can't
2117 // merge the update. This does not apply to tag store instructions which
2118 // ignore the address part of the source register.
2119 // This does not apply to STGPi as well, which does not have unpredictable
2120 // behavior in this case unlike normal stores, and always performs writeback
2121 // after reading the source register value.
2122 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2123 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2124 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2125 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2126 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2127 return E;
2128 }
2129 }
2130
2131 // Track which register units have been modified and used between the first
2132 // insn (inclusive) and the second insn.
2133 ModifiedRegUnits.clear();
2134 UsedRegUnits.clear();
2135 MBBI = next_nodbg(MBBI, E);
2136
2137 // We can't post-increment the stack pointer if any instruction between
2138 // the memory access (I) and the increment (MBBI) can access the memory
2139 // region defined by [SP, MBBI].
2140 const bool BaseRegSP = BaseReg == AArch64::SP;
2141 if (BaseRegSP && needsWinCFI(I->getMF())) {
2142 // FIXME: For now, we always block the optimization over SP in windows
2143 // targets as it requires to adjust the unwind/debug info, messing up
2144 // the unwind info can actually cause a miscompile.
2145 return E;
2146 }
2147
2148 for (unsigned Count = 0; MBBI != E && Count < Limit;
2149 MBBI = next_nodbg(MBBI, E)) {
2150 MachineInstr &MI = *MBBI;
2151
2152 // Don't count transient instructions towards the search limit since there
2153 // may be different numbers of them if e.g. debug information is present.
2154 if (!MI.isTransient())
2155 ++Count;
2156
2157 // If we found a match, return it.
2158 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
2159 return MBBI;
2160
2161 // Update the status of what the instruction clobbered and used.
2162 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2163
2164 // Otherwise, if the base register is used or modified, we have no match, so
2165 // return early.
2166 // If we are optimizing SP, do not allow instructions that may load or store
2167 // in between the load and the optimized value update.
2168 if (!ModifiedRegUnits.available(BaseReg) ||
2169 !UsedRegUnits.available(BaseReg) ||
2170 (BaseRegSP && MBBI->mayLoadOrStore()))
2171 return E;
2172 }
2173 return E;
2174 }
2175
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,unsigned Limit)2176 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2177 MachineBasicBlock::iterator I, unsigned Limit) {
2178 MachineBasicBlock::iterator B = I->getParent()->begin();
2179 MachineBasicBlock::iterator E = I->getParent()->end();
2180 MachineInstr &MemMI = *I;
2181 MachineBasicBlock::iterator MBBI = I;
2182 MachineFunction &MF = *MemMI.getMF();
2183
2184 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2185 int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
2186
2187 // If the load/store is the first instruction in the block, there's obviously
2188 // not any matching update. Ditto if the memory offset isn't zero.
2189 if (MBBI == B || Offset != 0)
2190 return E;
2191 // If the base register overlaps a destination register, we can't
2192 // merge the update.
2193 if (!isTagStore(MemMI)) {
2194 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2195 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2196 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2197 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2198 return E;
2199 }
2200 }
2201
2202 const bool BaseRegSP = BaseReg == AArch64::SP;
2203 if (BaseRegSP && needsWinCFI(I->getMF())) {
2204 // FIXME: For now, we always block the optimization over SP in windows
2205 // targets as it requires to adjust the unwind/debug info, messing up
2206 // the unwind info can actually cause a miscompile.
2207 return E;
2208 }
2209
2210 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2211 unsigned RedZoneSize =
2212 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2213
2214 // Track which register units have been modified and used between the first
2215 // insn (inclusive) and the second insn.
2216 ModifiedRegUnits.clear();
2217 UsedRegUnits.clear();
2218 unsigned Count = 0;
2219 bool MemAcessBeforeSPPreInc = false;
2220 do {
2221 MBBI = prev_nodbg(MBBI, B);
2222 MachineInstr &MI = *MBBI;
2223
2224 // Don't count transient instructions towards the search limit since there
2225 // may be different numbers of them if e.g. debug information is present.
2226 if (!MI.isTransient())
2227 ++Count;
2228
2229 // If we found a match, return it.
2230 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2231 // Check that the update value is within our red zone limit (which may be
2232 // zero).
2233 if (MemAcessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2234 return E;
2235 return MBBI;
2236 }
2237
2238 // Update the status of what the instruction clobbered and used.
2239 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2240
2241 // Otherwise, if the base register is used or modified, we have no match, so
2242 // return early.
2243 if (!ModifiedRegUnits.available(BaseReg) ||
2244 !UsedRegUnits.available(BaseReg))
2245 return E;
2246 // Keep track if we have a memory access before an SP pre-increment, in this
2247 // case we need to validate later that the update amount respects the red
2248 // zone.
2249 if (BaseRegSP && MBBI->mayLoadOrStore())
2250 MemAcessBeforeSPPreInc = true;
2251 } while (MBBI != B && Count < Limit);
2252 return E;
2253 }
2254
tryToPromoteLoadFromStore(MachineBasicBlock::iterator & MBBI)2255 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
2256 MachineBasicBlock::iterator &MBBI) {
2257 MachineInstr &MI = *MBBI;
2258 // If this is a volatile load, don't mess with it.
2259 if (MI.hasOrderedMemoryRef())
2260 return false;
2261
2262 if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
2263 return false;
2264
2265 // Make sure this is a reg+imm.
2266 // FIXME: It is possible to extend it to handle reg+reg cases.
2267 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
2268 return false;
2269
2270 // Look backward up to LdStLimit instructions.
2271 MachineBasicBlock::iterator StoreI;
2272 if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
2273 ++NumLoadsFromStoresPromoted;
2274 // Promote the load. Keeping the iterator straight is a
2275 // pain, so we let the merge routine tell us what the next instruction
2276 // is after it's done mucking about.
2277 MBBI = promoteLoadFromStore(MBBI, StoreI);
2278 return true;
2279 }
2280 return false;
2281 }
2282
2283 // Merge adjacent zero stores into a wider store.
tryToMergeZeroStInst(MachineBasicBlock::iterator & MBBI)2284 bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
2285 MachineBasicBlock::iterator &MBBI) {
2286 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
2287 MachineInstr &MI = *MBBI;
2288 MachineBasicBlock::iterator E = MI.getParent()->end();
2289
2290 if (!TII->isCandidateToMergeOrPair(MI))
2291 return false;
2292
2293 // Look ahead up to LdStLimit instructions for a mergable instruction.
2294 LdStPairFlags Flags;
2295 MachineBasicBlock::iterator MergeMI =
2296 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
2297 if (MergeMI != E) {
2298 ++NumZeroStoresPromoted;
2299
2300 // Keeping the iterator straight is a pain, so we let the merge routine tell
2301 // us what the next instruction is after it's done mucking about.
2302 MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
2303 return true;
2304 }
2305 return false;
2306 }
2307
2308 // Find loads and stores that can be merged into a single load or store pair
2309 // instruction.
tryToPairLdStInst(MachineBasicBlock::iterator & MBBI)2310 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
2311 MachineInstr &MI = *MBBI;
2312 MachineBasicBlock::iterator E = MI.getParent()->end();
2313
2314 if (!TII->isCandidateToMergeOrPair(MI))
2315 return false;
2316
2317 // If disable-ldp feature is opted, do not emit ldp.
2318 if (MI.mayLoad() && Subtarget->hasDisableLdp())
2319 return false;
2320
2321 // If disable-stp feature is opted, do not emit stp.
2322 if (MI.mayStore() && Subtarget->hasDisableStp())
2323 return false;
2324
2325 // Early exit if the offset is not possible to match. (6 bits of positive
2326 // range, plus allow an extra one in case we find a later insn that matches
2327 // with Offset-1)
2328 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
2329 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
2330 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
2331 // Allow one more for offset.
2332 if (Offset > 0)
2333 Offset -= OffsetStride;
2334 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
2335 return false;
2336
2337 // Look ahead up to LdStLimit instructions for a pairable instruction.
2338 LdStPairFlags Flags;
2339 MachineBasicBlock::iterator Paired =
2340 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
2341 if (Paired != E) {
2342 // Keeping the iterator straight is a pain, so we let the merge routine tell
2343 // us what the next instruction is after it's done mucking about.
2344 auto Prev = std::prev(MBBI);
2345
2346 // Fetch the memoperand of the load/store that is a candidate for
2347 // combination.
2348 MachineMemOperand *MemOp =
2349 MI.memoperands_empty() ? nullptr : MI.memoperands().front();
2350
2351 // If a load/store arrives and ldp/stp-aligned-only feature is opted, check
2352 // that the alignment of the source pointer is at least double the alignment
2353 // of the type.
2354 if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
2355 (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
2356 // If there is no size/align information, cancel the transformation.
2357 if (!MemOp || !MemOp->getMemoryType().isValid()) {
2358 NumFailedAlignmentCheck++;
2359 return false;
2360 }
2361
2362 // Get the needed alignments to check them if
2363 // ldp-aligned-only/stp-aligned-only features are opted.
2364 uint64_t MemAlignment = MemOp->getAlign().value();
2365 uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
2366
2367 if (MemAlignment < 2 * TypeAlignment) {
2368 NumFailedAlignmentCheck++;
2369 return false;
2370 }
2371 }
2372
2373 ++NumPairCreated;
2374 if (TII->hasUnscaledLdStOffset(MI))
2375 ++NumUnscaledPairCreated;
2376
2377 MBBI = mergePairedInsns(MBBI, Paired, Flags);
2378 // Collect liveness info for instructions between Prev and the new position
2379 // MBBI.
2380 for (auto I = std::next(Prev); I != MBBI; I++)
2381 updateDefinedRegisters(*I, DefinedInBB, TRI);
2382
2383 return true;
2384 }
2385 return false;
2386 }
2387
tryToMergeLdStUpdate(MachineBasicBlock::iterator & MBBI)2388 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
2389 (MachineBasicBlock::iterator &MBBI) {
2390 MachineInstr &MI = *MBBI;
2391 MachineBasicBlock::iterator E = MI.getParent()->end();
2392 MachineBasicBlock::iterator Update;
2393
2394 // Look forward to try to form a post-index instruction. For example,
2395 // ldr x0, [x20]
2396 // add x20, x20, #32
2397 // merged into:
2398 // ldr x0, [x20], #32
2399 Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
2400 if (Update != E) {
2401 // Merge the update into the ld/st.
2402 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
2403 return true;
2404 }
2405
2406 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2407 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2408 return false;
2409
2410 // Look back to try to find a pre-index instruction. For example,
2411 // add x0, x0, #8
2412 // ldr x1, [x0]
2413 // merged into:
2414 // ldr x1, [x0, #8]!
2415 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2416 if (Update != E) {
2417 // Merge the update into the ld/st.
2418 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2419 return true;
2420 }
2421
2422 // The immediate in the load/store is scaled by the size of the memory
2423 // operation. The immediate in the add we're looking for,
2424 // however, is not, so adjust here.
2425 int UnscaledOffset =
2426 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2427
2428 // Look forward to try to find a pre-index instruction. For example,
2429 // ldr x1, [x0, #64]
2430 // add x0, x0, #64
2431 // merged into:
2432 // ldr x1, [x0, #64]!
2433 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2434 if (Update != E) {
2435 // Merge the update into the ld/st.
2436 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2437 return true;
2438 }
2439
2440 return false;
2441 }
2442
optimizeBlock(MachineBasicBlock & MBB,bool EnableNarrowZeroStOpt)2443 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2444 bool EnableNarrowZeroStOpt) {
2445
2446 bool Modified = false;
2447 // Four tranformations to do here:
2448 // 1) Find loads that directly read from stores and promote them by
2449 // replacing with mov instructions. If the store is wider than the load,
2450 // the load will be replaced with a bitfield extract.
2451 // e.g.,
2452 // str w1, [x0, #4]
2453 // ldrh w2, [x0, #6]
2454 // ; becomes
2455 // str w1, [x0, #4]
2456 // lsr w2, w1, #16
2457 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2458 MBBI != E;) {
2459 if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2460 Modified = true;
2461 else
2462 ++MBBI;
2463 }
2464 // 2) Merge adjacent zero stores into a wider store.
2465 // e.g.,
2466 // strh wzr, [x0]
2467 // strh wzr, [x0, #2]
2468 // ; becomes
2469 // str wzr, [x0]
2470 // e.g.,
2471 // str wzr, [x0]
2472 // str wzr, [x0, #4]
2473 // ; becomes
2474 // str xzr, [x0]
2475 if (EnableNarrowZeroStOpt)
2476 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2477 MBBI != E;) {
2478 if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2479 Modified = true;
2480 else
2481 ++MBBI;
2482 }
2483 // 3) Find loads and stores that can be merged into a single load or store
2484 // pair instruction.
2485 // e.g.,
2486 // ldr x0, [x2]
2487 // ldr x1, [x2, #8]
2488 // ; becomes
2489 // ldp x0, x1, [x2]
2490
2491 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2492 DefinedInBB.clear();
2493 DefinedInBB.addLiveIns(MBB);
2494 }
2495
2496 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2497 MBBI != E;) {
2498 // Track currently live registers up to this point, to help with
2499 // searching for a rename register on demand.
2500 updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2501 if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2502 Modified = true;
2503 else
2504 ++MBBI;
2505 }
2506 // 4) Find base register updates that can be merged into the load or store
2507 // as a base-reg writeback.
2508 // e.g.,
2509 // ldr x0, [x2]
2510 // add x2, x2, #4
2511 // ; becomes
2512 // ldr x0, [x2], #4
2513 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2514 MBBI != E;) {
2515 if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2516 Modified = true;
2517 else
2518 ++MBBI;
2519 }
2520
2521 return Modified;
2522 }
2523
runOnMachineFunction(MachineFunction & Fn)2524 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2525 if (skipFunction(Fn.getFunction()))
2526 return false;
2527
2528 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2529 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2530 TRI = Subtarget->getRegisterInfo();
2531 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2532
2533 // Resize the modified and used register unit trackers. We do this once
2534 // per function and then clear the register units each time we optimize a load
2535 // or store.
2536 ModifiedRegUnits.init(*TRI);
2537 UsedRegUnits.init(*TRI);
2538 DefinedInBB.init(*TRI);
2539
2540 bool Modified = false;
2541 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2542 for (auto &MBB : Fn) {
2543 auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2544 Modified |= M;
2545 }
2546
2547 return Modified;
2548 }
2549
2550 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2551 // stores near one another? Note: The pre-RA instruction scheduler already has
2552 // hooks to try and schedule pairable loads/stores together to improve pairing
2553 // opportunities. Thus, pre-RA pairing pass may not be worth the effort.
2554
2555 // FIXME: When pairing store instructions it's very possible for this pass to
2556 // hoist a store with a KILL marker above another use (without a KILL marker).
2557 // The resulting IR is invalid, but nothing uses the KILL markers after this
2558 // pass, so it's never caused a problem in practice.
2559
2560 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2561 /// load / store optimization pass.
createAArch64LoadStoreOptimizationPass()2562 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2563 return new AArch64LoadStoreOpt();
2564 }
2565