//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag; alternatively, it should apply an
// ad-hoc fix to the unwind information.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled load/stores");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumConstOffsetFolded,
          "Number of const offsets of index addresses folded");

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// The LdStConstLimit limits how far we search for const offset instructions
// when we form index address load/store instructions.
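// For illustration, with a small constant offset this lets a sequence like
//   mov x8, #24
//   ldrb w0, [x1, x8]
// be rewritten as
//   ldrb w0, [x1, #24]
// provided the constant-producing instruction is found within this limit.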
static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
                                        cl::init(10), cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If not none, RenameReg can be used to rename the result register of the
  // first store in a pair. Currently this only works when merging stores
  // forward.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  LiveRegUnits DefinedInBB;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero store instructions indicated into a single
  // wider store instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
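  // For example, two adjacent loads
  //   ldr w0, [x2]
  //   ldr w1, [x2, #4]
  // become a single
  //   ldp w0, w1, [x2]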
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a register assigned with a const
  // value that can be combined with the current instruction (a load or store)
  // using base addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                  unsigned &Offset);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
                              unsigned IndexReg, unsigned &Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  MachineBasicBlock::iterator
  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
                       MachineBasicBlock::iterator Update, unsigned Offset,
                       int Scale);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from a store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  // Find and merge an index ldr/st instruction into a base ld/st instruction.
  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    return true;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return
AArch64::STPDi; 356 case AArch64::STRDpre: 357 return AArch64::STPDpre; 358 case AArch64::STRQui: 359 case AArch64::STURQi: 360 return AArch64::STPQi; 361 case AArch64::STRQpre: 362 return AArch64::STPQpre; 363 case AArch64::STRWui: 364 case AArch64::STURWi: 365 return AArch64::STPWi; 366 case AArch64::STRWpre: 367 return AArch64::STPWpre; 368 case AArch64::STRXui: 369 case AArch64::STURXi: 370 return AArch64::STPXi; 371 case AArch64::STRXpre: 372 return AArch64::STPXpre; 373 case AArch64::LDRSui: 374 case AArch64::LDURSi: 375 return AArch64::LDPSi; 376 case AArch64::LDRSpre: 377 return AArch64::LDPSpre; 378 case AArch64::LDRDui: 379 case AArch64::LDURDi: 380 return AArch64::LDPDi; 381 case AArch64::LDRDpre: 382 return AArch64::LDPDpre; 383 case AArch64::LDRQui: 384 case AArch64::LDURQi: 385 return AArch64::LDPQi; 386 case AArch64::LDRQpre: 387 return AArch64::LDPQpre; 388 case AArch64::LDRWui: 389 case AArch64::LDURWi: 390 return AArch64::LDPWi; 391 case AArch64::LDRWpre: 392 return AArch64::LDPWpre; 393 case AArch64::LDRXui: 394 case AArch64::LDURXi: 395 return AArch64::LDPXi; 396 case AArch64::LDRXpre: 397 return AArch64::LDPXpre; 398 case AArch64::LDRSWui: 399 case AArch64::LDURSWi: 400 return AArch64::LDPSWi; 401 case AArch64::LDRSWpre: 402 return AArch64::LDPSWpre; 403 } 404 } 405 406 static unsigned isMatchingStore(MachineInstr &LoadInst, 407 MachineInstr &StoreInst) { 408 unsigned LdOpc = LoadInst.getOpcode(); 409 unsigned StOpc = StoreInst.getOpcode(); 410 switch (LdOpc) { 411 default: 412 llvm_unreachable("Unsupported load instruction!"); 413 case AArch64::LDRBBui: 414 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || 415 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 416 case AArch64::LDURBBi: 417 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || 418 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 419 case AArch64::LDRHHui: 420 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || 421 StOpc == AArch64::STRXui; 422 case AArch64::LDURHHi: 423 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || 424 StOpc == AArch64::STURXi; 425 case AArch64::LDRWui: 426 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 427 case AArch64::LDURWi: 428 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 429 case AArch64::LDRXui: 430 return StOpc == AArch64::STRXui; 431 case AArch64::LDURXi: 432 return StOpc == AArch64::STURXi; 433 } 434 } 435 436 static unsigned getPreIndexedOpcode(unsigned Opc) { 437 // FIXME: We don't currently support creating pre-indexed loads/stores when 438 // the load or store is the unscaled version. If we decide to perform such an 439 // optimization in the future the cases for the unscaled loads/stores will 440 // need to be added here. 
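  // For reference, a pre-indexed form updates the base register before the
  // access, e.g. "str w0, [x1, #8]!" stores to x1+8 and then writes x1+8 back
  // into x1.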
441 switch (Opc) { 442 default: 443 llvm_unreachable("Opcode has no pre-indexed equivalent!"); 444 case AArch64::STRSui: 445 return AArch64::STRSpre; 446 case AArch64::STRDui: 447 return AArch64::STRDpre; 448 case AArch64::STRQui: 449 return AArch64::STRQpre; 450 case AArch64::STRBBui: 451 return AArch64::STRBBpre; 452 case AArch64::STRHHui: 453 return AArch64::STRHHpre; 454 case AArch64::STRWui: 455 return AArch64::STRWpre; 456 case AArch64::STRXui: 457 return AArch64::STRXpre; 458 case AArch64::LDRSui: 459 return AArch64::LDRSpre; 460 case AArch64::LDRDui: 461 return AArch64::LDRDpre; 462 case AArch64::LDRQui: 463 return AArch64::LDRQpre; 464 case AArch64::LDRBBui: 465 return AArch64::LDRBBpre; 466 case AArch64::LDRHHui: 467 return AArch64::LDRHHpre; 468 case AArch64::LDRWui: 469 return AArch64::LDRWpre; 470 case AArch64::LDRXui: 471 return AArch64::LDRXpre; 472 case AArch64::LDRSWui: 473 return AArch64::LDRSWpre; 474 case AArch64::LDPSi: 475 return AArch64::LDPSpre; 476 case AArch64::LDPSWi: 477 return AArch64::LDPSWpre; 478 case AArch64::LDPDi: 479 return AArch64::LDPDpre; 480 case AArch64::LDPQi: 481 return AArch64::LDPQpre; 482 case AArch64::LDPWi: 483 return AArch64::LDPWpre; 484 case AArch64::LDPXi: 485 return AArch64::LDPXpre; 486 case AArch64::STPSi: 487 return AArch64::STPSpre; 488 case AArch64::STPDi: 489 return AArch64::STPDpre; 490 case AArch64::STPQi: 491 return AArch64::STPQpre; 492 case AArch64::STPWi: 493 return AArch64::STPWpre; 494 case AArch64::STPXi: 495 return AArch64::STPXpre; 496 case AArch64::STGi: 497 return AArch64::STGPreIndex; 498 case AArch64::STZGi: 499 return AArch64::STZGPreIndex; 500 case AArch64::ST2Gi: 501 return AArch64::ST2GPreIndex; 502 case AArch64::STZ2Gi: 503 return AArch64::STZ2GPreIndex; 504 case AArch64::STGPi: 505 return AArch64::STGPpre; 506 } 507 } 508 509 static unsigned getBaseAddressOpcode(unsigned Opc) { 510 // TODO: Add more index address loads/stores. 
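  // For example, the register-offset form LDRBBroX ("ldrb w0, [x1, x2]") maps
  // to the unsigned-immediate base form LDRBBui ("ldrb w0, [x1, #imm]") once
  // the constant in the offset register has been folded into the offset.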
511 switch (Opc) { 512 default: 513 llvm_unreachable("Opcode has no base address equivalent!"); 514 case AArch64::LDRBBroX: 515 return AArch64::LDRBBui; 516 } 517 } 518 519 static unsigned getPostIndexedOpcode(unsigned Opc) { 520 switch (Opc) { 521 default: 522 llvm_unreachable("Opcode has no post-indexed wise equivalent!"); 523 case AArch64::STRSui: 524 case AArch64::STURSi: 525 return AArch64::STRSpost; 526 case AArch64::STRDui: 527 case AArch64::STURDi: 528 return AArch64::STRDpost; 529 case AArch64::STRQui: 530 case AArch64::STURQi: 531 return AArch64::STRQpost; 532 case AArch64::STRBBui: 533 return AArch64::STRBBpost; 534 case AArch64::STRHHui: 535 return AArch64::STRHHpost; 536 case AArch64::STRWui: 537 case AArch64::STURWi: 538 return AArch64::STRWpost; 539 case AArch64::STRXui: 540 case AArch64::STURXi: 541 return AArch64::STRXpost; 542 case AArch64::LDRSui: 543 case AArch64::LDURSi: 544 return AArch64::LDRSpost; 545 case AArch64::LDRDui: 546 case AArch64::LDURDi: 547 return AArch64::LDRDpost; 548 case AArch64::LDRQui: 549 case AArch64::LDURQi: 550 return AArch64::LDRQpost; 551 case AArch64::LDRBBui: 552 return AArch64::LDRBBpost; 553 case AArch64::LDRHHui: 554 return AArch64::LDRHHpost; 555 case AArch64::LDRWui: 556 case AArch64::LDURWi: 557 return AArch64::LDRWpost; 558 case AArch64::LDRXui: 559 case AArch64::LDURXi: 560 return AArch64::LDRXpost; 561 case AArch64::LDRSWui: 562 return AArch64::LDRSWpost; 563 case AArch64::LDPSi: 564 return AArch64::LDPSpost; 565 case AArch64::LDPSWi: 566 return AArch64::LDPSWpost; 567 case AArch64::LDPDi: 568 return AArch64::LDPDpost; 569 case AArch64::LDPQi: 570 return AArch64::LDPQpost; 571 case AArch64::LDPWi: 572 return AArch64::LDPWpost; 573 case AArch64::LDPXi: 574 return AArch64::LDPXpost; 575 case AArch64::STPSi: 576 return AArch64::STPSpost; 577 case AArch64::STPDi: 578 return AArch64::STPDpost; 579 case AArch64::STPQi: 580 return AArch64::STPQpost; 581 case AArch64::STPWi: 582 return AArch64::STPWpost; 583 case AArch64::STPXi: 584 return AArch64::STPXpost; 585 case AArch64::STGi: 586 return AArch64::STGPostIndex; 587 case AArch64::STZGi: 588 return AArch64::STZGPostIndex; 589 case AArch64::ST2Gi: 590 return AArch64::ST2GPostIndex; 591 case AArch64::STZ2Gi: 592 return AArch64::STZ2GPostIndex; 593 case AArch64::STGPi: 594 return AArch64::STGPpost; 595 } 596 } 597 598 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { 599 600 unsigned OpcA = FirstMI.getOpcode(); 601 unsigned OpcB = MI.getOpcode(); 602 603 switch (OpcA) { 604 default: 605 return false; 606 case AArch64::STRSpre: 607 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi); 608 case AArch64::STRDpre: 609 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi); 610 case AArch64::STRQpre: 611 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi); 612 case AArch64::STRWpre: 613 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi); 614 case AArch64::STRXpre: 615 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi); 616 case AArch64::LDRSpre: 617 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi); 618 case AArch64::LDRDpre: 619 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi); 620 case AArch64::LDRQpre: 621 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi); 622 case AArch64::LDRWpre: 623 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi); 624 case AArch64::LDRXpre: 625 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi); 626 case AArch64::LDRSWpre: 627 return 
(OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi); 628 } 629 } 630 631 // Returns the scale and offset range of pre/post indexed variants of MI. 632 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, 633 int &MinOffset, int &MaxOffset) { 634 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI); 635 bool IsTagStore = isTagStore(MI); 636 // ST*G and all paired ldst have the same scale in pre/post-indexed variants 637 // as in the "unsigned offset" variant. 638 // All other pre/post indexed ldst instructions are unscaled. 639 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1; 640 641 if (IsPaired) { 642 MinOffset = -64; 643 MaxOffset = 63; 644 } else { 645 MinOffset = -256; 646 MaxOffset = 255; 647 } 648 } 649 650 static MachineOperand &getLdStRegOp(MachineInstr &MI, 651 unsigned PairedRegOp = 0) { 652 assert(PairedRegOp < 2 && "Unexpected register operand idx."); 653 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI); 654 if (IsPreLdSt) 655 PairedRegOp += 1; 656 unsigned Idx = 657 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; 658 return MI.getOperand(Idx); 659 } 660 661 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, 662 MachineInstr &StoreInst, 663 const AArch64InstrInfo *TII) { 664 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); 665 int LoadSize = TII->getMemScale(LoadInst); 666 int StoreSize = TII->getMemScale(StoreInst); 667 int UnscaledStOffset = 668 TII->hasUnscaledLdStOffset(StoreInst) 669 ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() 670 : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize; 671 int UnscaledLdOffset = 672 TII->hasUnscaledLdStOffset(LoadInst) 673 ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() 674 : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize; 675 return (UnscaledStOffset <= UnscaledLdOffset) && 676 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); 677 } 678 679 static bool isPromotableZeroStoreInst(MachineInstr &MI) { 680 unsigned Opc = MI.getOpcode(); 681 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi || 682 isNarrowStore(Opc)) && 683 getLdStRegOp(MI).getReg() == AArch64::WZR; 684 } 685 686 static bool isPromotableLoadFromStore(MachineInstr &MI) { 687 switch (MI.getOpcode()) { 688 default: 689 return false; 690 // Scaled instructions. 691 case AArch64::LDRBBui: 692 case AArch64::LDRHHui: 693 case AArch64::LDRWui: 694 case AArch64::LDRXui: 695 // Unscaled instructions. 696 case AArch64::LDURBBi: 697 case AArch64::LDURHHi: 698 case AArch64::LDURWi: 699 case AArch64::LDURXi: 700 return true; 701 } 702 } 703 704 static bool isMergeableLdStUpdate(MachineInstr &MI) { 705 unsigned Opc = MI.getOpcode(); 706 switch (Opc) { 707 default: 708 return false; 709 // Scaled instructions. 710 case AArch64::STRSui: 711 case AArch64::STRDui: 712 case AArch64::STRQui: 713 case AArch64::STRXui: 714 case AArch64::STRWui: 715 case AArch64::STRHHui: 716 case AArch64::STRBBui: 717 case AArch64::LDRSui: 718 case AArch64::LDRDui: 719 case AArch64::LDRQui: 720 case AArch64::LDRXui: 721 case AArch64::LDRWui: 722 case AArch64::LDRHHui: 723 case AArch64::LDRBBui: 724 case AArch64::STGi: 725 case AArch64::STZGi: 726 case AArch64::ST2Gi: 727 case AArch64::STZ2Gi: 728 case AArch64::STGPi: 729 // Unscaled instructions. 
730 case AArch64::STURSi: 731 case AArch64::STURDi: 732 case AArch64::STURQi: 733 case AArch64::STURWi: 734 case AArch64::STURXi: 735 case AArch64::LDURSi: 736 case AArch64::LDURDi: 737 case AArch64::LDURQi: 738 case AArch64::LDURWi: 739 case AArch64::LDURXi: 740 // Paired instructions. 741 case AArch64::LDPSi: 742 case AArch64::LDPSWi: 743 case AArch64::LDPDi: 744 case AArch64::LDPQi: 745 case AArch64::LDPWi: 746 case AArch64::LDPXi: 747 case AArch64::STPSi: 748 case AArch64::STPDi: 749 case AArch64::STPQi: 750 case AArch64::STPWi: 751 case AArch64::STPXi: 752 // Make sure this is a reg+imm (as opposed to an address reloc). 753 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) 754 return false; 755 756 return true; 757 } 758 } 759 760 // Make sure this is a reg+reg Ld/St 761 static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { 762 unsigned Opc = MI.getOpcode(); 763 switch (Opc) { 764 default: 765 return false; 766 // Scaled instructions. 767 // TODO: Add more index address loads/stores. 768 case AArch64::LDRBBroX: 769 Scale = 1; 770 return true; 771 } 772 } 773 774 static bool isRewritableImplicitDef(unsigned Opc) { 775 switch (Opc) { 776 default: 777 return false; 778 case AArch64::ORRWrs: 779 case AArch64::ADDWri: 780 return true; 781 } 782 } 783 784 MachineBasicBlock::iterator 785 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, 786 MachineBasicBlock::iterator MergeMI, 787 const LdStPairFlags &Flags) { 788 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && 789 "Expected promotable zero stores."); 790 791 MachineBasicBlock::iterator E = I->getParent()->end(); 792 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 793 // If NextI is the second of the two instructions to be merged, we need 794 // to skip one further. Either way we merge will invalidate the iterator, 795 // and we don't need to scan the new instruction, as it's a pairwise 796 // instruction, which we're not considering for further action anyway. 797 if (NextI == MergeMI) 798 NextI = next_nodbg(NextI, E); 799 800 unsigned Opc = I->getOpcode(); 801 unsigned MergeMIOpc = MergeMI->getOpcode(); 802 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc); 803 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc); 804 int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1; 805 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1; 806 807 bool MergeForward = Flags.getMergeForward(); 808 // Insert our new paired instruction after whichever of the paired 809 // instructions MergeForward indicates. 810 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; 811 // Also based on MergeForward is from where we copy the base register operand 812 // so we get the flags compatible with the input code. 813 const MachineOperand &BaseRegOp = 814 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI) 815 : AArch64InstrInfo::getLdStBaseOp(*I); 816 817 // Which register is Rt and which is Rt2 depends on the offset order. 818 int64_t IOffsetInBytes = 819 AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride; 820 int64_t MIOffsetInBytes = 821 AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() * 822 MergeMIOffsetStride; 823 // Select final offset based on the offset order. 
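  // For example, merging "strh wzr, [x0, #4]" with "strh wzr, [x0, #6]" yields
  // "str wzr, [x0, #4]": the lower of the two byte offsets becomes the offset
  // of the wider store.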
824 int64_t OffsetImm; 825 if (IOffsetInBytes > MIOffsetInBytes) 826 OffsetImm = MIOffsetInBytes; 827 else 828 OffsetImm = IOffsetInBytes; 829 830 int NewOpcode = getMatchingWideOpcode(Opc); 831 bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode); 832 833 // Adjust final offset if the result opcode is a scaled store. 834 if (FinalIsScaled) { 835 int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1; 836 assert(((OffsetImm % NewOffsetStride) == 0) && 837 "Offset should be a multiple of the store memory scale"); 838 OffsetImm = OffsetImm / NewOffsetStride; 839 } 840 841 // Construct the new instruction. 842 DebugLoc DL = I->getDebugLoc(); 843 MachineBasicBlock *MBB = I->getParent(); 844 MachineInstrBuilder MIB; 845 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) 846 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) 847 .add(BaseRegOp) 848 .addImm(OffsetImm) 849 .cloneMergedMemRefs({&*I, &*MergeMI}) 850 .setMIFlags(I->mergeFlagsWith(*MergeMI)); 851 (void)MIB; 852 853 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n "); 854 LLVM_DEBUG(I->print(dbgs())); 855 LLVM_DEBUG(dbgs() << " "); 856 LLVM_DEBUG(MergeMI->print(dbgs())); 857 LLVM_DEBUG(dbgs() << " with instruction:\n "); 858 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 859 LLVM_DEBUG(dbgs() << "\n"); 860 861 // Erase the old instructions. 862 I->eraseFromParent(); 863 MergeMI->eraseFromParent(); 864 return NextI; 865 } 866 867 // Apply Fn to all instructions between MI and the beginning of the block, until 868 // a def for DefReg is reached. Returns true, iff Fn returns true for all 869 // visited instructions. Stop after visiting Limit iterations. 870 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg, 871 const TargetRegisterInfo *TRI, unsigned Limit, 872 std::function<bool(MachineInstr &, bool)> &Fn) { 873 auto MBB = MI.getParent(); 874 for (MachineInstr &I : 875 instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) { 876 if (!Limit) 877 return false; 878 --Limit; 879 880 bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) { 881 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() && 882 TRI->regsOverlap(MOP.getReg(), DefReg); 883 }); 884 if (!Fn(I, isDef)) 885 return false; 886 if (isDef) 887 break; 888 } 889 return true; 890 } 891 892 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units, 893 const TargetRegisterInfo *TRI) { 894 895 for (const MachineOperand &MOP : phys_regs_and_masks(MI)) 896 if (MOP.isReg() && MOP.isKill()) 897 Units.removeReg(MOP.getReg()); 898 899 for (const MachineOperand &MOP : phys_regs_and_masks(MI)) 900 if (MOP.isReg() && !MOP.isKill()) 901 Units.addReg(MOP.getReg()); 902 } 903 904 MachineBasicBlock::iterator 905 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, 906 MachineBasicBlock::iterator Paired, 907 const LdStPairFlags &Flags) { 908 MachineBasicBlock::iterator E = I->getParent()->end(); 909 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 910 // If NextI is the second of the two instructions to be merged, we need 911 // to skip one further. Either way we merge will invalidate the iterator, 912 // and we don't need to scan the new instruction, as it's a pairwise 913 // instruction, which we're not considering for further action anyway. 914 if (NextI == Paired) 915 NextI = next_nodbg(NextI, E); 916 917 int SExtIdx = Flags.getSExtIdx(); 918 unsigned Opc = 919 SExtIdx == -1 ? 
I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); 920 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc); 921 int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1; 922 923 bool MergeForward = Flags.getMergeForward(); 924 925 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg(); 926 if (RenameReg) { 927 MCRegister RegToRename = getLdStRegOp(*I).getReg(); 928 DefinedInBB.addReg(*RenameReg); 929 930 // Return the sub/super register for RenameReg, matching the size of 931 // OriginalReg. 932 auto GetMatchingSubReg = 933 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg { 934 for (MCPhysReg SubOrSuper : 935 TRI->sub_and_superregs_inclusive(*RenameReg)) { 936 if (C->contains(SubOrSuper)) 937 return SubOrSuper; 938 } 939 llvm_unreachable("Should have found matching sub or super register!"); 940 }; 941 942 std::function<bool(MachineInstr &, bool)> UpdateMIs = 943 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI, 944 bool IsDef) { 945 if (IsDef) { 946 bool SeenDef = false; 947 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { 948 MachineOperand &MOP = MI.getOperand(OpIdx); 949 // Rename the first explicit definition and all implicit 950 // definitions matching RegToRename. 951 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 952 (!MergeForward || !SeenDef || 953 (MOP.isDef() && MOP.isImplicit())) && 954 TRI->regsOverlap(MOP.getReg(), RegToRename)) { 955 assert((MOP.isImplicit() || 956 (MOP.isRenamable() && !MOP.isEarlyClobber())) && 957 "Need renamable operands"); 958 Register MatchingReg; 959 if (const TargetRegisterClass *RC = 960 MI.getRegClassConstraint(OpIdx, TII, TRI)) 961 MatchingReg = GetMatchingSubReg(RC); 962 else { 963 if (!isRewritableImplicitDef(MI.getOpcode())) 964 continue; 965 MatchingReg = GetMatchingSubReg( 966 TRI->getMinimalPhysRegClass(MOP.getReg())); 967 } 968 MOP.setReg(MatchingReg); 969 SeenDef = true; 970 } 971 } 972 } else { 973 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { 974 MachineOperand &MOP = MI.getOperand(OpIdx); 975 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 976 TRI->regsOverlap(MOP.getReg(), RegToRename)) { 977 assert((MOP.isImplicit() || 978 (MOP.isRenamable() && !MOP.isEarlyClobber())) && 979 "Need renamable operands"); 980 Register MatchingReg; 981 if (const TargetRegisterClass *RC = 982 MI.getRegClassConstraint(OpIdx, TII, TRI)) 983 MatchingReg = GetMatchingSubReg(RC); 984 else 985 MatchingReg = GetMatchingSubReg( 986 TRI->getMinimalPhysRegClass(MOP.getReg())); 987 assert(MatchingReg != AArch64::NoRegister && 988 "Cannot find matching regs for renaming"); 989 MOP.setReg(MatchingReg); 990 } 991 } 992 } 993 LLVM_DEBUG(dbgs() << "Renamed " << MI); 994 return true; 995 }; 996 forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI, 997 UINT32_MAX, UpdateMIs); 998 999 #if !defined(NDEBUG) 1000 // For forward merging store: 1001 // Make sure the register used for renaming is not used between the 1002 // paired instructions. That would trash the content before the new 1003 // paired instruction. 1004 MCPhysReg RegToCheck = *RenameReg; 1005 // For backward merging load: 1006 // Make sure the register being renamed is not used between the 1007 // paired instructions. That would trash the content after the new 1008 // paired instruction. 1009 if (!MergeForward) 1010 RegToCheck = RegToRename; 1011 for (auto &MI : 1012 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>( 1013 MergeForward ? std::next(I) : I, 1014 MergeForward ? 
std::next(Paired) : Paired)) 1015 assert(all_of(MI.operands(), 1016 [this, RegToCheck](const MachineOperand &MOP) { 1017 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() || 1018 MOP.isUndef() || 1019 !TRI->regsOverlap(MOP.getReg(), RegToCheck); 1020 }) && 1021 "Rename register used between paired instruction, trashing the " 1022 "content"); 1023 #endif 1024 } 1025 1026 // Insert our new paired instruction after whichever of the paired 1027 // instructions MergeForward indicates. 1028 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; 1029 // Also based on MergeForward is from where we copy the base register operand 1030 // so we get the flags compatible with the input code. 1031 const MachineOperand &BaseRegOp = 1032 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired) 1033 : AArch64InstrInfo::getLdStBaseOp(*I); 1034 1035 int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm(); 1036 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm(); 1037 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode()); 1038 if (IsUnscaled != PairedIsUnscaled) { 1039 // We're trying to pair instructions that differ in how they are scaled. If 1040 // I is scaled then scale the offset of Paired accordingly. Otherwise, do 1041 // the opposite (i.e., make Paired's offset unscaled). 1042 int MemSize = TII->getMemScale(*Paired); 1043 if (PairedIsUnscaled) { 1044 // If the unscaled offset isn't a multiple of the MemSize, we can't 1045 // pair the operations together. 1046 assert(!(PairedOffset % TII->getMemScale(*Paired)) && 1047 "Offset should be a multiple of the stride!"); 1048 PairedOffset /= MemSize; 1049 } else { 1050 PairedOffset *= MemSize; 1051 } 1052 } 1053 1054 // Which register is Rt and which is Rt2 depends on the offset order. 1055 // However, for pre load/stores the Rt should be the one of the pre 1056 // load/store. 1057 MachineInstr *RtMI, *Rt2MI; 1058 if (Offset == PairedOffset + OffsetStride && 1059 !AArch64InstrInfo::isPreLdSt(*I)) { 1060 RtMI = &*Paired; 1061 Rt2MI = &*I; 1062 // Here we swapped the assumption made for SExtIdx. 1063 // I.e., we turn ldp I, Paired into ldp Paired, I. 1064 // Update the index accordingly. 1065 if (SExtIdx != -1) 1066 SExtIdx = (SExtIdx + 1) % 2; 1067 } else { 1068 RtMI = &*I; 1069 Rt2MI = &*Paired; 1070 } 1071 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); 1072 // Scale the immediate offset, if necessary. 1073 if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) { 1074 assert(!(OffsetImm % TII->getMemScale(*RtMI)) && 1075 "Unscaled offset cannot be scaled."); 1076 OffsetImm /= TII->getMemScale(*RtMI); 1077 } 1078 1079 // Construct the new instruction. 1080 MachineInstrBuilder MIB; 1081 DebugLoc DL = I->getDebugLoc(); 1082 MachineBasicBlock *MBB = I->getParent(); 1083 MachineOperand RegOp0 = getLdStRegOp(*RtMI); 1084 MachineOperand RegOp1 = getLdStRegOp(*Rt2MI); 1085 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1; 1086 // Kill flags may become invalid when moving stores for pairing. 1087 if (RegOp0.isUse()) { 1088 if (!MergeForward) { 1089 // Clear kill flags on store if moving upwards. Example: 1090 // STRWui kill %w0, ... 1091 // USE %w1 1092 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards 1093 // We are about to move the store of w1, so its kill flag may become 1094 // invalid; not the case for w0. 1095 // Since w1 is used between the stores, the kill flag on w1 is cleared 1096 // after merging. 1097 // STPWi kill %w0, %w1, ... 
1098 // USE %w1 1099 for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It) 1100 if (It->readsRegister(PairedRegOp.getReg(), TRI)) 1101 PairedRegOp.setIsKill(false); 1102 } else { 1103 // Clear kill flags of the first stores register. Example: 1104 // STRWui %w1, ... 1105 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards 1106 // STRW %w0 1107 Register Reg = getLdStRegOp(*I).getReg(); 1108 for (MachineInstr &MI : make_range(std::next(I), Paired)) 1109 MI.clearRegisterKills(Reg, TRI); 1110 } 1111 } 1112 1113 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc); 1114 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode)); 1115 1116 // Adds the pre-index operand for pre-indexed ld/st pairs. 1117 if (AArch64InstrInfo::isPreLdSt(*RtMI)) 1118 MIB.addReg(BaseRegOp.getReg(), RegState::Define); 1119 1120 MIB.add(RegOp0) 1121 .add(RegOp1) 1122 .add(BaseRegOp) 1123 .addImm(OffsetImm) 1124 .cloneMergedMemRefs({&*I, &*Paired}) 1125 .setMIFlags(I->mergeFlagsWith(*Paired)); 1126 1127 (void)MIB; 1128 1129 LLVM_DEBUG( 1130 dbgs() << "Creating pair load/store. Replacing instructions:\n "); 1131 LLVM_DEBUG(I->print(dbgs())); 1132 LLVM_DEBUG(dbgs() << " "); 1133 LLVM_DEBUG(Paired->print(dbgs())); 1134 LLVM_DEBUG(dbgs() << " with instruction:\n "); 1135 if (SExtIdx != -1) { 1136 // Generate the sign extension for the proper result of the ldp. 1137 // I.e., with X1, that would be: 1138 // %w1 = KILL %w1, implicit-def %x1 1139 // %x1 = SBFMXri killed %x1, 0, 31 1140 MachineOperand &DstMO = MIB->getOperand(SExtIdx); 1141 // Right now, DstMO has the extended register, since it comes from an 1142 // extended opcode. 1143 Register DstRegX = DstMO.getReg(); 1144 // Get the W variant of that register. 1145 Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); 1146 // Update the result of LDP to use the W instead of the X variant. 1147 DstMO.setReg(DstRegW); 1148 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1149 LLVM_DEBUG(dbgs() << "\n"); 1150 // Make the machine verifier happy by providing a definition for 1151 // the X register. 1152 // Insert this definition right after the generated LDP, i.e., before 1153 // InsertionPoint. 1154 MachineInstrBuilder MIBKill = 1155 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) 1156 .addReg(DstRegW) 1157 .addReg(DstRegX, RegState::Define); 1158 MIBKill->getOperand(2).setImplicit(); 1159 // Create the sign extension. 1160 MachineInstrBuilder MIBSXTW = 1161 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) 1162 .addReg(DstRegX) 1163 .addImm(0) 1164 .addImm(31); 1165 (void)MIBSXTW; 1166 LLVM_DEBUG(dbgs() << " Extend operand:\n "); 1167 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); 1168 } else { 1169 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1170 } 1171 LLVM_DEBUG(dbgs() << "\n"); 1172 1173 if (MergeForward) 1174 for (const MachineOperand &MOP : phys_regs_and_masks(*I)) 1175 if (MOP.isReg() && MOP.isKill()) 1176 DefinedInBB.addReg(MOP.getReg()); 1177 1178 // Erase the old instructions. 
1179 I->eraseFromParent(); 1180 Paired->eraseFromParent(); 1181 1182 return NextI; 1183 } 1184 1185 MachineBasicBlock::iterator 1186 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 1187 MachineBasicBlock::iterator StoreI) { 1188 MachineBasicBlock::iterator NextI = 1189 next_nodbg(LoadI, LoadI->getParent()->end()); 1190 1191 int LoadSize = TII->getMemScale(*LoadI); 1192 int StoreSize = TII->getMemScale(*StoreI); 1193 Register LdRt = getLdStRegOp(*LoadI).getReg(); 1194 const MachineOperand &StMO = getLdStRegOp(*StoreI); 1195 Register StRt = getLdStRegOp(*StoreI).getReg(); 1196 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); 1197 1198 assert((IsStoreXReg || 1199 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && 1200 "Unexpected RegClass"); 1201 1202 MachineInstr *BitExtMI; 1203 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { 1204 // Remove the load, if the destination register of the loads is the same 1205 // register for stored value. 1206 if (StRt == LdRt && LoadSize == 8) { 1207 for (MachineInstr &MI : make_range(StoreI->getIterator(), 1208 LoadI->getIterator())) { 1209 if (MI.killsRegister(StRt, TRI)) { 1210 MI.clearRegisterKills(StRt, TRI); 1211 break; 1212 } 1213 } 1214 LLVM_DEBUG(dbgs() << "Remove load instruction:\n "); 1215 LLVM_DEBUG(LoadI->print(dbgs())); 1216 LLVM_DEBUG(dbgs() << "\n"); 1217 LoadI->eraseFromParent(); 1218 return NextI; 1219 } 1220 // Replace the load with a mov if the load and store are in the same size. 1221 BitExtMI = 1222 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1223 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) 1224 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) 1225 .add(StMO) 1226 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 1227 .setMIFlags(LoadI->getFlags()); 1228 } else { 1229 // FIXME: Currently we disable this transformation in big-endian targets as 1230 // performance and correctness are verified only in little-endian. 1231 if (!Subtarget->isLittleEndian()) 1232 return NextI; 1233 bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI); 1234 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) && 1235 "Unsupported ld/st match"); 1236 assert(LoadSize <= StoreSize && "Invalid load size"); 1237 int UnscaledLdOffset = 1238 IsUnscaled 1239 ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() 1240 : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize; 1241 int UnscaledStOffset = 1242 IsUnscaled 1243 ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() 1244 : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize; 1245 int Width = LoadSize * 8; 1246 Register DestReg = 1247 IsStoreXReg ? Register(TRI->getMatchingSuperReg( 1248 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) 1249 : LdRt; 1250 1251 assert((UnscaledLdOffset >= UnscaledStOffset && 1252 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && 1253 "Invalid offset"); 1254 1255 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); 1256 int Imms = Immr + Width - 1; 1257 if (UnscaledLdOffset == UnscaledStOffset) { 1258 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N 1259 | ((Immr) << 6) // immr 1260 | ((Imms) << 0) // imms 1261 ; 1262 1263 BitExtMI = 1264 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1265 TII->get(IsStoreXReg ? 
AArch64::ANDXri : AArch64::ANDWri), 1266 DestReg) 1267 .add(StMO) 1268 .addImm(AndMaskEncoded) 1269 .setMIFlags(LoadI->getFlags()); 1270 } else { 1271 BitExtMI = 1272 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1273 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), 1274 DestReg) 1275 .add(StMO) 1276 .addImm(Immr) 1277 .addImm(Imms) 1278 .setMIFlags(LoadI->getFlags()); 1279 } 1280 } 1281 1282 // Clear kill flags between store and load. 1283 for (MachineInstr &MI : make_range(StoreI->getIterator(), 1284 BitExtMI->getIterator())) 1285 if (MI.killsRegister(StRt, TRI)) { 1286 MI.clearRegisterKills(StRt, TRI); 1287 break; 1288 } 1289 1290 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n "); 1291 LLVM_DEBUG(StoreI->print(dbgs())); 1292 LLVM_DEBUG(dbgs() << " "); 1293 LLVM_DEBUG(LoadI->print(dbgs())); 1294 LLVM_DEBUG(dbgs() << " with instructions:\n "); 1295 LLVM_DEBUG(StoreI->print(dbgs())); 1296 LLVM_DEBUG(dbgs() << " "); 1297 LLVM_DEBUG((BitExtMI)->print(dbgs())); 1298 LLVM_DEBUG(dbgs() << "\n"); 1299 1300 // Erase the old instructions. 1301 LoadI->eraseFromParent(); 1302 return NextI; 1303 } 1304 1305 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { 1306 // Convert the byte-offset used by unscaled into an "element" offset used 1307 // by the scaled pair load/store instructions. 1308 if (IsUnscaled) { 1309 // If the byte-offset isn't a multiple of the stride, there's no point 1310 // trying to match it. 1311 if (Offset % OffsetStride) 1312 return false; 1313 Offset /= OffsetStride; 1314 } 1315 return Offset <= 63 && Offset >= -64; 1316 } 1317 1318 // Do alignment, specialized to power of 2 and for signed ints, 1319 // avoiding having to do a C-style cast from uint_64t to int when 1320 // using alignTo from include/llvm/Support/MathExtras.h. 1321 // FIXME: Move this function to include/MathExtras.h? 1322 static int alignTo(int Num, int PowOf2) { 1323 return (Num + PowOf2 - 1) & ~(PowOf2 - 1); 1324 } 1325 1326 static bool mayAlias(MachineInstr &MIa, 1327 SmallVectorImpl<MachineInstr *> &MemInsns, 1328 AliasAnalysis *AA) { 1329 for (MachineInstr *MIb : MemInsns) 1330 if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) 1331 return true; 1332 1333 return false; 1334 } 1335 1336 bool AArch64LoadStoreOpt::findMatchingStore( 1337 MachineBasicBlock::iterator I, unsigned Limit, 1338 MachineBasicBlock::iterator &StoreI) { 1339 MachineBasicBlock::iterator B = I->getParent()->begin(); 1340 MachineBasicBlock::iterator MBBI = I; 1341 MachineInstr &LoadMI = *I; 1342 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg(); 1343 1344 // If the load is the first instruction in the block, there's obviously 1345 // not any matching store. 1346 if (MBBI == B) 1347 return false; 1348 1349 // Track which register units have been modified and used between the first 1350 // insn and the second insn. 1351 ModifiedRegUnits.clear(); 1352 UsedRegUnits.clear(); 1353 1354 unsigned Count = 0; 1355 do { 1356 MBBI = prev_nodbg(MBBI, B); 1357 MachineInstr &MI = *MBBI; 1358 1359 // Don't count transient instructions towards the search limit since there 1360 // may be different numbers of them if e.g. debug information is present. 1361 if (!MI.isTransient()) 1362 ++Count; 1363 1364 // If the load instruction reads directly from the address to which the 1365 // store instruction writes and the stored value is not modified, we can 1366 // promote the load. 
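    // For example, "ldr w2, [x0, #4]" following "str w1, [x0, #4]" can be
    // rewritten to copy w1 instead of reloading the value from memory.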
    // Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also, we can't handle stores without an immediate offset operand, since
    // the operand might be the address of a global variable.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / uses register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

static bool needsWinCFI(const MachineFunction *MF) {
  return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         MF->getFunction().needsUnwindTableEntry();
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
                                  MI.getFlag(MachineInstr::FrameDestroy)))
    return false;

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: if the opcodes are pre ld/st, there is nothing more to check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(FirstMI);

  // Two pre ld/st of different opcodes cannot be merged either.
  if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
    return false;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
1449 if (isNarrowStore(OpcA) || isNarrowStore(OpcB)) 1450 return false; 1451 1452 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and 1453 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui 1454 // are candidate pairs that can be merged. 1455 if (isPreLdStPairCandidate(FirstMI, MI)) 1456 return true; 1457 1458 // Try to match an unscaled load/store with a scaled load/store. 1459 return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) && 1460 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); 1461 1462 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? 1463 } 1464 1465 static bool canRenameMOP(const MachineOperand &MOP, 1466 const TargetRegisterInfo *TRI) { 1467 if (MOP.isReg()) { 1468 auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg()); 1469 // Renaming registers with multiple disjunct sub-registers (e.g. the 1470 // result of a LD3) means that all sub-registers are renamed, potentially 1471 // impacting other instructions we did not check. Bail out. 1472 // Note that this relies on the structure of the AArch64 register file. In 1473 // particular, a subregister cannot be written without overwriting the 1474 // whole register. 1475 if (RegClass->HasDisjunctSubRegs) { 1476 LLVM_DEBUG( 1477 dbgs() 1478 << " Cannot rename operands with multiple disjunct subregisters (" 1479 << MOP << ")\n"); 1480 return false; 1481 } 1482 1483 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite 1484 // them must be known. For example, in ORRWrs the implicit-def 1485 // corresponds to the result register. 1486 if (MOP.isImplicit() && MOP.isDef()) { 1487 if (!isRewritableImplicitDef(MOP.getParent()->getOpcode())) 1488 return false; 1489 return TRI->isSuperOrSubRegisterEq( 1490 MOP.getParent()->getOperand(0).getReg(), MOP.getReg()); 1491 } 1492 } 1493 return MOP.isImplicit() || 1494 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied()); 1495 } 1496 1497 static bool 1498 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, 1499 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1500 const TargetRegisterInfo *TRI) { 1501 if (!FirstMI.mayStore()) 1502 return false; 1503 1504 // Check if we can find an unused register which we can use to rename 1505 // the register used by the first load/store. 1506 1507 auto RegToRename = getLdStRegOp(FirstMI).getReg(); 1508 // For now, we only rename if the store operand gets killed at the store. 1509 if (!getLdStRegOp(FirstMI).isKill() && 1510 !any_of(FirstMI.operands(), 1511 [TRI, RegToRename](const MachineOperand &MOP) { 1512 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 1513 MOP.isImplicit() && MOP.isKill() && 1514 TRI->regsOverlap(RegToRename, MOP.getReg()); 1515 })) { 1516 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI); 1517 return false; 1518 } 1519 1520 bool FoundDef = false; 1521 1522 // For each instruction between FirstMI and the previous def for RegToRename, 1523 // we 1524 // * check if we can rename RegToRename in this instruction 1525 // * collect the registers used and required register classes for RegToRename. 1526 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI, 1527 bool IsDef) { 1528 LLVM_DEBUG(dbgs() << "Checking " << MI); 1529 // Currently we do not try to rename across frame-setup instructions. 
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those
      // cases in multiple ways:
      //       1. Insert an extra copy, to materialize the def.
      //       2. Skip pseudo-defs until we find a non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

// We want to merge the second load into the first by rewriting the usages of
// the same reg between first (incl.) and second (excl.). We don't need to care
// about any insns before FirstLoad or after SecondLoad.
// 1. The second load writes a new value into the same reg.
//    - The renaming therefore cannot impact later uses of the reg.
//    - The second load always trashes the value written by the first load,
//      which means the reg must be killed before the second load.
// 2. The first load must be a def for the same reg, so we don't need to look
//    into anything before it.
static bool canRenameUntilSecondLoad(
    MachineInstr &FirstLoad, MachineInstr &SecondLoad,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  if (FirstLoad.isPseudo())
    return false;

  UsedInBetween.accumulate(FirstLoad);
  auto RegToRename = getLdStRegOp(FirstLoad).getReg();
  bool Success = std::all_of(
      FirstLoad.getIterator(), SecondLoad.getIterator(),
      [&](MachineInstr &MI) {
        LLVM_DEBUG(dbgs() << "Checking " << MI);
        // Currently we do not try to rename across frame-setup instructions.
1617 if (MI.getFlag(MachineInstr::FrameSetup)) { 1618 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions " 1619 << "currently\n"); 1620 return false; 1621 } 1622 1623 for (auto &MOP : MI.operands()) { 1624 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() || 1625 !TRI->regsOverlap(MOP.getReg(), RegToRename)) 1626 continue; 1627 if (!canRenameMOP(MOP, TRI)) { 1628 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); 1629 return false; 1630 } 1631 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); 1632 } 1633 1634 return true; 1635 }); 1636 return Success; 1637 } 1638 1639 // Check if we can find a physical register for renaming \p Reg. This register 1640 // must: 1641 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all 1642 // defined registers up to the point where the renamed register will be used, 1643 // * not used in \p UsedInBetween; UsedInBetween must contain all accessed 1644 // registers in the range the rename register will be used, 1645 // * is available in all used register classes (checked using RequiredClasses). 1646 static std::optional<MCPhysReg> tryToFindRegisterToRename( 1647 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB, 1648 LiveRegUnits &UsedInBetween, 1649 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1650 const TargetRegisterInfo *TRI) { 1651 const MachineRegisterInfo &RegInfo = MF.getRegInfo(); 1652 1653 // Checks if any sub- or super-register of PR is callee saved. 1654 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) { 1655 return any_of(TRI->sub_and_superregs_inclusive(PR), 1656 [&MF, TRI](MCPhysReg SubOrSuper) { 1657 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF); 1658 }); 1659 }; 1660 1661 // Check if PR or one of its sub- or super-registers can be used for all 1662 // required register classes. 1663 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) { 1664 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) { 1665 return any_of( 1666 TRI->sub_and_superregs_inclusive(PR), 1667 [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); }); 1668 }); 1669 }; 1670 1671 auto *RegClass = TRI->getMinimalPhysRegClass(Reg); 1672 for (const MCPhysReg &PR : *RegClass) { 1673 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) && 1674 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) && 1675 CanBeUsedForAllClasses(PR)) { 1676 DefinedInBB.addReg(PR); 1677 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI) 1678 << "\n"); 1679 return {PR}; 1680 } 1681 } 1682 LLVM_DEBUG(dbgs() << "No rename register found from " 1683 << TRI->getRegClassName(RegClass) << "\n"); 1684 return std::nullopt; 1685 } 1686 1687 // For store pairs: returns a register from FirstMI to the beginning of the 1688 // block that can be renamed. 1689 // For load pairs: returns a register from FirstMI to MI that can be renamed. 
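// A hypothetical illustration of the load-pair case (register choices are
// made up for the example): given
//   ldr w1, [x9]
//   add w2, w1, w1
//   ldr w1, [x9, #4]
// renaming the first w1 (and its uses up to the second load) to a free
// register such as w8 allows the two loads to be paired later:
//   ldp w8, w1, [x9]
//   add w2, w8, w8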
1690 static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair( 1691 std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI, 1692 Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween, 1693 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1694 const TargetRegisterInfo *TRI) { 1695 std::optional<MCPhysReg> RenameReg; 1696 if (!DebugCounter::shouldExecute(RegRenamingCounter)) 1697 return RenameReg; 1698 1699 auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); 1700 MachineFunction &MF = *FirstMI.getParent()->getParent(); 1701 if (!RegClass || !MF.getRegInfo().tracksLiveness()) 1702 return RenameReg; 1703 1704 const bool IsLoad = FirstMI.mayLoad(); 1705 1706 if (!MaybeCanRename) { 1707 if (IsLoad) 1708 MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween, 1709 RequiredClasses, TRI)}; 1710 else 1711 MaybeCanRename = { 1712 canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)}; 1713 } 1714 1715 if (*MaybeCanRename) { 1716 RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween, 1717 RequiredClasses, TRI); 1718 } 1719 return RenameReg; 1720 } 1721 1722 /// Scan the instructions looking for a load/store that can be combined with the 1723 /// current instruction into a wider equivalent or a load/store pair. 1724 MachineBasicBlock::iterator 1725 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, 1726 LdStPairFlags &Flags, unsigned Limit, 1727 bool FindNarrowMerge) { 1728 MachineBasicBlock::iterator E = I->getParent()->end(); 1729 MachineBasicBlock::iterator MBBI = I; 1730 MachineBasicBlock::iterator MBBIWithRenameReg; 1731 MachineInstr &FirstMI = *I; 1732 MBBI = next_nodbg(MBBI, E); 1733 1734 bool MayLoad = FirstMI.mayLoad(); 1735 bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI); 1736 Register Reg = getLdStRegOp(FirstMI).getReg(); 1737 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg(); 1738 int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm(); 1739 int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; 1740 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); 1741 1742 std::optional<bool> MaybeCanRename; 1743 if (!EnableRenaming) 1744 MaybeCanRename = {false}; 1745 1746 SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses; 1747 LiveRegUnits UsedInBetween; 1748 UsedInBetween.init(*TRI); 1749 1750 Flags.clearRenameReg(); 1751 1752 // Track which register units have been modified and used between the first 1753 // insn (inclusive) and the second insn. 1754 ModifiedRegUnits.clear(); 1755 UsedRegUnits.clear(); 1756 1757 // Remember any instructions that read/write memory between FirstMI and MI. 1758 SmallVector<MachineInstr *, 4> MemInsns; 1759 1760 for (unsigned Count = 0; MBBI != E && Count < Limit; 1761 MBBI = next_nodbg(MBBI, E)) { 1762 MachineInstr &MI = *MBBI; 1763 1764 UsedInBetween.accumulate(MI); 1765 1766 // Don't count transient instructions towards the search limit since there 1767 // may be different numbers of them if e.g. debug information is present. 1768 if (!MI.isTransient()) 1769 ++Count; 1770 1771 Flags.setSExtIdx(-1); 1772 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && 1773 AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) { 1774 assert(MI.mayLoadOrStore() && "Expected memory operation."); 1775 // If we've found another instruction with the same opcode, check to see 1776 // if the base and offset are compatible with our starting instruction. 
1777 // These instructions all have scaled immediate operands, so we just 1778 // check for +1/-1. Make sure to check the new instruction offset is 1779 // actually an immediate and not a symbolic reference destined for 1780 // a relocation. 1781 Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg(); 1782 int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); 1783 bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI); 1784 if (IsUnscaled != MIIsUnscaled) { 1785 // We're trying to pair instructions that differ in how they are scaled. 1786 // If FirstMI is scaled then scale the offset of MI accordingly. 1787 // Otherwise, do the opposite (i.e., make MI's offset unscaled). 1788 int MemSize = TII->getMemScale(MI); 1789 if (MIIsUnscaled) { 1790 // If the unscaled offset isn't a multiple of the MemSize, we can't 1791 // pair the operations together: bail and keep looking. 1792 if (MIOffset % MemSize) { 1793 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1794 UsedRegUnits, TRI); 1795 MemInsns.push_back(&MI); 1796 continue; 1797 } 1798 MIOffset /= MemSize; 1799 } else { 1800 MIOffset *= MemSize; 1801 } 1802 } 1803 1804 bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI); 1805 1806 if (BaseReg == MIBaseReg) { 1807 // If the offset of the second ld/st is not equal to the size of the 1808 // destination register it can’t be paired with a pre-index ld/st 1809 // pair. Additionally if the base reg is used or modified the operations 1810 // can't be paired: bail and keep looking. 1811 if (IsPreLdSt) { 1812 bool IsOutOfBounds = MIOffset != TII->getMemScale(MI); 1813 bool IsBaseRegUsed = !UsedRegUnits.available( 1814 AArch64InstrInfo::getLdStBaseOp(MI).getReg()); 1815 bool IsBaseRegModified = !ModifiedRegUnits.available( 1816 AArch64InstrInfo::getLdStBaseOp(MI).getReg()); 1817 // If the stored value and the address of the second instruction is 1818 // the same, it needs to be using the updated register and therefore 1819 // it must not be folded. 1820 bool IsMIRegTheSame = 1821 TRI->regsOverlap(getLdStRegOp(MI).getReg(), 1822 AArch64InstrInfo::getLdStBaseOp(MI).getReg()); 1823 if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified || 1824 IsMIRegTheSame) { 1825 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1826 UsedRegUnits, TRI); 1827 MemInsns.push_back(&MI); 1828 continue; 1829 } 1830 } else { 1831 if ((Offset != MIOffset + OffsetStride) && 1832 (Offset + OffsetStride != MIOffset)) { 1833 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1834 UsedRegUnits, TRI); 1835 MemInsns.push_back(&MI); 1836 continue; 1837 } 1838 } 1839 1840 int MinOffset = Offset < MIOffset ? Offset : MIOffset; 1841 if (FindNarrowMerge) { 1842 // If the alignment requirements of the scaled wide load/store 1843 // instruction can't express the offset of the scaled narrow input, 1844 // bail and keep looking. For promotable zero stores, allow only when 1845 // the stored value is the same (i.e., WZR). 1846 if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || 1847 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { 1848 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1849 UsedRegUnits, TRI); 1850 MemInsns.push_back(&MI); 1851 continue; 1852 } 1853 } else { 1854 // Pairwise instructions have a 7-bit signed offset field. Single 1855 // insns have a 12-bit unsigned offset field. If the resultant 1856 // immediate offset of merging these instructions is out of range for 1857 // a pairwise instruction, bail and keep looking. 
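// As an illustration (assuming an 8-byte scaled access): the paired form can
// encode byte offsets in [-512, 504] (signed imm7 * 8), whereas the single
// scaled form encodes [0, 32760] (unsigned imm12 * 8). inBoundsForPair below
// performs the actual range check.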
1858 if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
1859 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1860 UsedRegUnits, TRI);
1861 MemInsns.push_back(&MI);
1862 continue;
1863 }
1864 // If the alignment requirements of the paired (scaled) instruction
1865 // can't express the offset of the unscaled input, bail and keep
1866 // looking.
1867 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
1868 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1869 UsedRegUnits, TRI);
1870 MemInsns.push_back(&MI);
1871 continue;
1872 }
1873 }
1874
1875 // If the BaseReg has been modified, then we cannot do the optimization.
1876 // For example, in the following pattern
1877 // ldr x1, [x2]
1878 // ldr x2, [x3]
1879 // ldr x4, [x2, #8]
1880 // the first and third ldr cannot be converted to ldp x1, x4, [x2]
1881 if (!ModifiedRegUnits.available(BaseReg))
1882 return E;
1883
1884 const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
1885 Reg, getLdStRegOp(MI).getReg());
1886
1887 // If the Rt of the second instruction was not modified or used between
1888 // the two instructions and none of the instructions between the second
1889 // and first alias with the second, we can combine the second into the
1890 // first.
1891 if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) &&
1892 !(MI.mayLoad() && !SameLoadReg &&
1893 !UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
1894 !mayAlias(MI, MemInsns, AA)) {
1895 // For pairs loading into the same reg, try to find a renaming
1896 // opportunity to allow the renaming of Reg between FirstMI and MI
1897 // and combine MI into FirstMI; otherwise bail and keep looking.
1898 if (SameLoadReg) {
1899 std::optional<MCPhysReg> RenameReg =
1900 findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
1901 Reg, DefinedInBB, UsedInBetween,
1902 RequiredClasses, TRI);
1903 if (!RenameReg) {
1904 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
1905 UsedRegUnits, TRI);
1906 MemInsns.push_back(&MI);
1907 continue;
1908 }
1909 Flags.setRenameReg(*RenameReg);
1910 }
1911
1912 Flags.setMergeForward(false);
1913 if (!SameLoadReg)
1914 Flags.clearRenameReg();
1915 return MBBI;
1916 }
1917
1918 // Likewise, if the Rt of the first instruction is not modified or used
1919 // between the two instructions and none of the instructions between the
1920 // first and the second alias with the first, we can combine the first
1921 // into the second.
1922 if (!(MayLoad &&
1923 !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
1924 !mayAlias(FirstMI, MemInsns, AA)) {
1925
1926 if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
1927 Flags.setMergeForward(true);
1928 Flags.clearRenameReg();
1929 return MBBI;
1930 }
1931
1932 std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
1933 MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
1934 RequiredClasses, TRI);
1935 if (RenameReg) {
1936 Flags.setMergeForward(true);
1937 Flags.setRenameReg(*RenameReg);
1938 MBBIWithRenameReg = MBBI;
1939 }
1940 }
1941 // Unable to combine these instructions due to interference in between.
1942 // Keep looking.
1943 }
1944 }
1945
1946 if (Flags.getRenameReg())
1947 return MBBIWithRenameReg;
1948
1949 // The instruction wasn't a matching load or store. Stop searching if we
1950 // encounter a call instruction that might modify memory.
1951 if (MI.isCall())
1952 return E;
1953
1954 // Update modified / uses register units.
1955 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
1956
1957 // Otherwise, if the base register is modified, we have no match, so
1958 // return early.
1959 if (!ModifiedRegUnits.available(BaseReg))
1960 return E;
1961
1962 // Update list of instructions that read/write memory.
1963 if (MI.mayLoadOrStore())
1964 MemInsns.push_back(&MI);
1965 }
1966 return E;
1967 }
1968
1969 static MachineBasicBlock::iterator
1970 maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
1971 auto End = MI.getParent()->end();
1972 if (MaybeCFI == End ||
1973 MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
1974 !(MI.getFlag(MachineInstr::FrameSetup) ||
1975 MI.getFlag(MachineInstr::FrameDestroy)) ||
1976 AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
1977 return End;
1978
1979 const MachineFunction &MF = *MI.getParent()->getParent();
1980 unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
1981 const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
1982 switch (CFI.getOperation()) {
1983 case MCCFIInstruction::OpDefCfa:
1984 case MCCFIInstruction::OpDefCfaOffset:
1985 return MaybeCFI;
1986 default:
1987 return End;
1988 }
1989 }
1990
1991 MachineBasicBlock::iterator
1992 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
1993 MachineBasicBlock::iterator Update,
1994 bool IsPreIdx) {
1995 assert((Update->getOpcode() == AArch64::ADDXri ||
1996 Update->getOpcode() == AArch64::SUBXri) &&
1997 "Unexpected base register update instruction to merge!");
1998 MachineBasicBlock::iterator E = I->getParent()->end();
1999 MachineBasicBlock::iterator NextI = next_nodbg(I, E);
2000
2001 // If updating the SP and the following instruction is a CFA-offset-related
2002 // CFI instruction, move it after the merged instruction.
2003 MachineBasicBlock::iterator CFI =
2004 IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
2005
2006 // Return the instruction following the merged instruction, which is
2007 // the instruction following our unmerged load. Unless that's the add/sub
2008 // instruction we're merging, in which case it's the one after that.
2009 if (NextI == Update)
2010 NextI = next_nodbg(NextI, E);
2011
2012 int Value = Update->getOperand(2).getImm();
2013 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
2014 "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
2015 if (Update->getOpcode() == AArch64::SUBXri)
2016 Value = -Value;
2017
2018 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
2019 : getPostIndexedOpcode(I->getOpcode());
2020 MachineInstrBuilder MIB;
2021 int Scale, MinOffset, MaxOffset;
2022 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
2023 if (!AArch64InstrInfo::isPairedLdSt(*I)) {
2024 // Non-paired instruction.
2025 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
2026 .add(getLdStRegOp(*Update))
2027 .add(getLdStRegOp(*I))
2028 .add(AArch64InstrInfo::getLdStBaseOp(*I))
2029 .addImm(Value / Scale)
2030 .setMemRefs(I->memoperands())
2031 .setMIFlags(I->mergeFlagsWith(*Update));
2032 } else {
2033 // Paired instruction.
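// For instance (operands chosen for illustration), merging
//   ldp x0, x1, [sp]
//   add sp, sp, #16
// as a post-index update yields
//   ldp x0, x1, [sp], #16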
2034 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 2035 .add(getLdStRegOp(*Update)) 2036 .add(getLdStRegOp(*I, 0)) 2037 .add(getLdStRegOp(*I, 1)) 2038 .add(AArch64InstrInfo::getLdStBaseOp(*I)) 2039 .addImm(Value / Scale) 2040 .setMemRefs(I->memoperands()) 2041 .setMIFlags(I->mergeFlagsWith(*Update)); 2042 } 2043 if (CFI != E) { 2044 MachineBasicBlock *MBB = I->getParent(); 2045 MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI); 2046 } 2047 2048 if (IsPreIdx) { 2049 ++NumPreFolded; 2050 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store."); 2051 } else { 2052 ++NumPostFolded; 2053 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store."); 2054 } 2055 LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); 2056 LLVM_DEBUG(I->print(dbgs())); 2057 LLVM_DEBUG(dbgs() << " "); 2058 LLVM_DEBUG(Update->print(dbgs())); 2059 LLVM_DEBUG(dbgs() << " with instruction:\n "); 2060 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 2061 LLVM_DEBUG(dbgs() << "\n"); 2062 2063 // Erase the old instructions for the block. 2064 I->eraseFromParent(); 2065 Update->eraseFromParent(); 2066 2067 return NextI; 2068 } 2069 2070 MachineBasicBlock::iterator 2071 AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, 2072 MachineBasicBlock::iterator Update, 2073 unsigned Offset, int Scale) { 2074 assert((Update->getOpcode() == AArch64::MOVKWi) && 2075 "Unexpected const mov instruction to merge!"); 2076 MachineBasicBlock::iterator E = I->getParent()->end(); 2077 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 2078 MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); 2079 MachineInstr &MemMI = *I; 2080 unsigned Mask = (1 << 12) * Scale - 1; 2081 unsigned Low = Offset & Mask; 2082 unsigned High = Offset - Low; 2083 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); 2084 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); 2085 MachineInstrBuilder AddMIB, MemMIB; 2086 2087 // Add IndexReg, BaseReg, High (the BaseReg may be SP) 2088 AddMIB = 2089 BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) 2090 .addDef(IndexReg) 2091 .addUse(BaseReg) 2092 .addImm(High >> 12) // shifted value 2093 .addImm(12); // shift 12 2094 (void)AddMIB; 2095 // Ld/St DestReg, IndexReg, Imm12 2096 unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); 2097 MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 2098 .add(getLdStRegOp(MemMI)) 2099 .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) 2100 .addImm(Low / Scale) 2101 .setMemRefs(I->memoperands()) 2102 .setMIFlags(I->mergeFlagsWith(*Update)); 2103 (void)MemMIB; 2104 2105 ++NumConstOffsetFolded; 2106 LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); 2107 LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); 2108 LLVM_DEBUG(PrevI->print(dbgs())); 2109 LLVM_DEBUG(dbgs() << " "); 2110 LLVM_DEBUG(Update->print(dbgs())); 2111 LLVM_DEBUG(dbgs() << " "); 2112 LLVM_DEBUG(I->print(dbgs())); 2113 LLVM_DEBUG(dbgs() << " with instruction:\n "); 2114 LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); 2115 LLVM_DEBUG(dbgs() << " "); 2116 LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); 2117 LLVM_DEBUG(dbgs() << "\n"); 2118 2119 // Erase the old instructions for the block. 
2120 I->eraseFromParent(); 2121 PrevI->eraseFromParent(); 2122 Update->eraseFromParent(); 2123 2124 return NextI; 2125 } 2126 2127 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, 2128 MachineInstr &MI, 2129 unsigned BaseReg, int Offset) { 2130 switch (MI.getOpcode()) { 2131 default: 2132 break; 2133 case AArch64::SUBXri: 2134 case AArch64::ADDXri: 2135 // Make sure it's a vanilla immediate operand, not a relocation or 2136 // anything else we can't handle. 2137 if (!MI.getOperand(2).isImm()) 2138 break; 2139 // Watch out for 1 << 12 shifted value. 2140 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) 2141 break; 2142 2143 // The update instruction source and destination register must be the 2144 // same as the load/store base register. 2145 if (MI.getOperand(0).getReg() != BaseReg || 2146 MI.getOperand(1).getReg() != BaseReg) 2147 break; 2148 2149 int UpdateOffset = MI.getOperand(2).getImm(); 2150 if (MI.getOpcode() == AArch64::SUBXri) 2151 UpdateOffset = -UpdateOffset; 2152 2153 // The immediate must be a multiple of the scaling factor of the pre/post 2154 // indexed instruction. 2155 int Scale, MinOffset, MaxOffset; 2156 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset); 2157 if (UpdateOffset % Scale != 0) 2158 break; 2159 2160 // Scaled offset must fit in the instruction immediate. 2161 int ScaledOffset = UpdateOffset / Scale; 2162 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset) 2163 break; 2164 2165 // If we have a non-zero Offset, we check that it matches the amount 2166 // we're adding to the register. 2167 if (!Offset || Offset == UpdateOffset) 2168 return true; 2169 break; 2170 } 2171 return false; 2172 } 2173 2174 bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, 2175 MachineInstr &MI, 2176 unsigned IndexReg, 2177 unsigned &Offset) { 2178 // The update instruction source and destination register must be the 2179 // same as the load/store index register. 2180 if (MI.getOpcode() == AArch64::MOVKWi && 2181 TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { 2182 2183 // movz + movk hold a large offset of a Ld/St instruction. 2184 MachineBasicBlock::iterator B = MI.getParent()->begin(); 2185 MachineBasicBlock::iterator MBBI = &MI; 2186 MBBI = prev_nodbg(MBBI, B); 2187 MachineInstr &MovzMI = *MBBI; 2188 if (MovzMI.getOpcode() == AArch64::MOVZWi) { 2189 unsigned Low = MovzMI.getOperand(1).getImm(); 2190 unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); 2191 Offset = High + Low; 2192 // 12-bit optionally shifted immediates are legal for adds. 2193 return Offset >> 24 == 0; 2194 } 2195 } 2196 return false; 2197 } 2198 2199 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( 2200 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { 2201 MachineBasicBlock::iterator E = I->getParent()->end(); 2202 MachineInstr &MemMI = *I; 2203 MachineBasicBlock::iterator MBBI = I; 2204 2205 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); 2206 int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() * 2207 TII->getMemScale(MemMI); 2208 2209 // Scan forward looking for post-index opportunities. Updating instructions 2210 // can't be formed if the memory instruction doesn't have the offset we're 2211 // looking for. 2212 if (MIUnscaledOffset != UnscaledOffset) 2213 return E; 2214 2215 // If the base register overlaps a source/destination register, we can't 2216 // merge the update. 
This does not apply to tag store instructions, which
2217 // ignore the address part of the source register.
2218 // Nor does it apply to STGPi, which, unlike normal stores, has no
2219 // unpredictable behavior in this case and always performs the writeback
2220 // after reading the source register value.
2221 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
2222 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2223 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
2224 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2225 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2226 return E;
2227 }
2228 }
2229
2230 // Track which register units have been modified and used between the first
2231 // insn (inclusive) and the second insn.
2232 ModifiedRegUnits.clear();
2233 UsedRegUnits.clear();
2234 MBBI = next_nodbg(MBBI, E);
2235
2236 // We can't post-increment the stack pointer if any instruction between
2237 // the memory access (I) and the increment (MBBI) can access the memory
2238 // region defined by [SP, MBBI].
2239 const bool BaseRegSP = BaseReg == AArch64::SP;
2240 if (BaseRegSP && needsWinCFI(I->getMF())) {
2241 // FIXME: For now, we always block the optimization over SP on Windows
2242 // targets, as it would require adjusting the unwind/debug info; getting
2243 // the unwind info wrong can actually cause a miscompile.
2244 return E;
2245 }
2246
2247 for (unsigned Count = 0; MBBI != E && Count < Limit;
2248 MBBI = next_nodbg(MBBI, E)) {
2249 MachineInstr &MI = *MBBI;
2250
2251 // Don't count transient instructions towards the search limit since there
2252 // may be different numbers of them if e.g. debug information is present.
2253 if (!MI.isTransient())
2254 ++Count;
2255
2256 // If we found a match, return it.
2257 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
2258 return MBBI;
2259
2260 // Update the status of what the instruction clobbered and used.
2261 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2262
2263 // Otherwise, if the base register is used or modified, we have no match, so
2264 // return early.
2265 // If we are optimizing SP, do not allow instructions that may load or store
2266 // in between the load and the optimized value update.
2267 if (!ModifiedRegUnits.available(BaseReg) ||
2268 !UsedRegUnits.available(BaseReg) ||
2269 (BaseRegSP && MBBI->mayLoadOrStore()))
2270 return E;
2271 }
2272 return E;
2273 }
2274
2275 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
2276 MachineBasicBlock::iterator I, unsigned Limit) {
2277 MachineBasicBlock::iterator B = I->getParent()->begin();
2278 MachineBasicBlock::iterator E = I->getParent()->end();
2279 MachineInstr &MemMI = *I;
2280 MachineBasicBlock::iterator MBBI = I;
2281 MachineFunction &MF = *MemMI.getMF();
2282
2283 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
2284 int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();
2285
2286 // If the load/store is the first instruction in the block, there's obviously
2287 // not any matching update. Ditto if the memory offset isn't zero.
2288 if (MBBI == B || Offset != 0)
2289 return E;
2290 // If the base register overlaps a destination register, we can't
2291 // merge the update.
2292 if (!isTagStore(MemMI)) {
2293 bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
2294 for (unsigned i = 0, e = IsPairedInsn ?
2 : 1; i != e; ++i) {
2295 Register DestReg = getLdStRegOp(MemMI, i).getReg();
2296 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
2297 return E;
2298 }
2299 }
2300
2301 const bool BaseRegSP = BaseReg == AArch64::SP;
2302 if (BaseRegSP && needsWinCFI(I->getMF())) {
2303 // FIXME: For now, we always block the optimization over SP on Windows
2304 // targets, as it would require adjusting the unwind/debug info; getting
2305 // the unwind info wrong can actually cause a miscompile.
2306 return E;
2307 }
2308
2309 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2310 unsigned RedZoneSize =
2311 Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
2312
2313 // Track which register units have been modified and used between the first
2314 // insn (inclusive) and the second insn.
2315 ModifiedRegUnits.clear();
2316 UsedRegUnits.clear();
2317 unsigned Count = 0;
2318 bool MemAccessBeforeSPPreInc = false;
2319 do {
2320 MBBI = prev_nodbg(MBBI, B);
2321 MachineInstr &MI = *MBBI;
2322
2323 // Don't count transient instructions towards the search limit since there
2324 // may be different numbers of them if e.g. debug information is present.
2325 if (!MI.isTransient())
2326 ++Count;
2327
2328 // If we found a match, return it.
2329 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
2330 // Check that the update value is within our red zone limit (which may be
2331 // zero).
2332 if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
2333 return E;
2334 return MBBI;
2335 }
2336
2337 // Update the status of what the instruction clobbered and used.
2338 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
2339
2340 // Otherwise, if the base register is used or modified, we have no match, so
2341 // return early.
2342 if (!ModifiedRegUnits.available(BaseReg) ||
2343 !UsedRegUnits.available(BaseReg))
2344 return E;
2345 // Keep track of whether we have a memory access before an SP pre-increment;
2346 // in that case we need to validate later that the update amount respects the
2347 // red zone.
2348 if (BaseRegSP && MBBI->mayLoadOrStore())
2349 MemAccessBeforeSPPreInc = true;
2350 } while (MBBI != B && Count < Limit);
2351 return E;
2352 }
2353
2354 MachineBasicBlock::iterator
2355 AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
2356 MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2357 MachineBasicBlock::iterator B = I->getParent()->begin();
2358 MachineBasicBlock::iterator E = I->getParent()->end();
2359 MachineInstr &MemMI = *I;
2360 MachineBasicBlock::iterator MBBI = I;
2361
2362 // If the load is the first instruction in the block, there's obviously
2363 // no matching instruction.
2364 if (MBBI == B)
2365 return E;
2366
2367 // Make sure the IndexReg is killed and the shift amount is zero.
2368 // TODO: Relax this restriction to also handle extends; keep the processing simple for now.
2369 if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
2370 !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
2371 (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
2372 return E;
2373
2374 Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
2375
2376 // Track which register units have been modified and used between the first
2377 // insn (inclusive) and the second insn.
2378 ModifiedRegUnits.clear(); 2379 UsedRegUnits.clear(); 2380 unsigned Count = 0; 2381 do { 2382 MBBI = prev_nodbg(MBBI, B); 2383 MachineInstr &MI = *MBBI; 2384 2385 // Don't count transient instructions towards the search limit since there 2386 // may be different numbers of them if e.g. debug information is present. 2387 if (!MI.isTransient()) 2388 ++Count; 2389 2390 // If we found a match, return it. 2391 if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { 2392 return MBBI; 2393 } 2394 2395 // Update the status of what the instruction clobbered and used. 2396 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 2397 2398 // Otherwise, if the index register is used or modified, we have no match, 2399 // so return early. 2400 if (!ModifiedRegUnits.available(IndexReg) || 2401 !UsedRegUnits.available(IndexReg)) 2402 return E; 2403 2404 } while (MBBI != B && Count < Limit); 2405 return E; 2406 } 2407 2408 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( 2409 MachineBasicBlock::iterator &MBBI) { 2410 MachineInstr &MI = *MBBI; 2411 // If this is a volatile load, don't mess with it. 2412 if (MI.hasOrderedMemoryRef()) 2413 return false; 2414 2415 if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy)) 2416 return false; 2417 2418 // Make sure this is a reg+imm. 2419 // FIXME: It is possible to extend it to handle reg+reg cases. 2420 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) 2421 return false; 2422 2423 // Look backward up to LdStLimit instructions. 2424 MachineBasicBlock::iterator StoreI; 2425 if (findMatchingStore(MBBI, LdStLimit, StoreI)) { 2426 ++NumLoadsFromStoresPromoted; 2427 // Promote the load. Keeping the iterator straight is a 2428 // pain, so we let the merge routine tell us what the next instruction 2429 // is after it's done mucking about. 2430 MBBI = promoteLoadFromStore(MBBI, StoreI); 2431 return true; 2432 } 2433 return false; 2434 } 2435 2436 // Merge adjacent zero stores into a wider store. 2437 bool AArch64LoadStoreOpt::tryToMergeZeroStInst( 2438 MachineBasicBlock::iterator &MBBI) { 2439 assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store."); 2440 MachineInstr &MI = *MBBI; 2441 MachineBasicBlock::iterator E = MI.getParent()->end(); 2442 2443 if (!TII->isCandidateToMergeOrPair(MI)) 2444 return false; 2445 2446 // Look ahead up to LdStLimit instructions for a mergable instruction. 2447 LdStPairFlags Flags; 2448 MachineBasicBlock::iterator MergeMI = 2449 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); 2450 if (MergeMI != E) { 2451 ++NumZeroStoresPromoted; 2452 2453 // Keeping the iterator straight is a pain, so we let the merge routine tell 2454 // us what the next instruction is after it's done mucking about. 2455 MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags); 2456 return true; 2457 } 2458 return false; 2459 } 2460 2461 // Find loads and stores that can be merged into a single load or store pair 2462 // instruction. 2463 bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { 2464 MachineInstr &MI = *MBBI; 2465 MachineBasicBlock::iterator E = MI.getParent()->end(); 2466 2467 if (!TII->isCandidateToMergeOrPair(MI)) 2468 return false; 2469 2470 // If disable-ldp feature is opted, do not emit ldp. 2471 if (MI.mayLoad() && Subtarget->hasDisableLdp()) 2472 return false; 2473 2474 // If disable-stp feature is opted, do not emit stp. 
2475 if (MI.mayStore() && Subtarget->hasDisableStp()) 2476 return false; 2477 2478 // Early exit if the offset is not possible to match. (6 bits of positive 2479 // range, plus allow an extra one in case we find a later insn that matches 2480 // with Offset-1) 2481 bool IsUnscaled = TII->hasUnscaledLdStOffset(MI); 2482 int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm(); 2483 int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1; 2484 // Allow one more for offset. 2485 if (Offset > 0) 2486 Offset -= OffsetStride; 2487 if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) 2488 return false; 2489 2490 // Look ahead up to LdStLimit instructions for a pairable instruction. 2491 LdStPairFlags Flags; 2492 MachineBasicBlock::iterator Paired = 2493 findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); 2494 if (Paired != E) { 2495 ++NumPairCreated; 2496 if (TII->hasUnscaledLdStOffset(MI)) 2497 ++NumUnscaledPairCreated; 2498 // Keeping the iterator straight is a pain, so we let the merge routine tell 2499 // us what the next instruction is after it's done mucking about. 2500 auto Prev = std::prev(MBBI); 2501 2502 // Fetch the memoperand of the load/store that is a candidate for 2503 // combination. 2504 MachineMemOperand *MemOp = 2505 MI.memoperands_empty() ? nullptr : MI.memoperands().front(); 2506 2507 // Get the needed alignments to check them if 2508 // ldp-aligned-only/stp-aligned-only features are opted. 2509 uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1; 2510 uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1; 2511 2512 // If a load arrives and ldp-aligned-only feature is opted, check that the 2513 // alignment of the source pointer is at least double the alignment of the 2514 // type. 2515 if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp && 2516 MemAlignment < 2 * TypeAlignment) 2517 return false; 2518 2519 // If a store arrives and stp-aligned-only feature is opted, check that the 2520 // alignment of the source pointer is at least double the alignment of the 2521 // type. 2522 if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp && 2523 MemAlignment < 2 * TypeAlignment) 2524 return false; 2525 2526 MBBI = mergePairedInsns(MBBI, Paired, Flags); 2527 // Collect liveness info for instructions between Prev and the new position 2528 // MBBI. 2529 for (auto I = std::next(Prev); I != MBBI; I++) 2530 updateDefinedRegisters(*I, DefinedInBB, TRI); 2531 2532 return true; 2533 } 2534 return false; 2535 } 2536 2537 bool AArch64LoadStoreOpt::tryToMergeLdStUpdate 2538 (MachineBasicBlock::iterator &MBBI) { 2539 MachineInstr &MI = *MBBI; 2540 MachineBasicBlock::iterator E = MI.getParent()->end(); 2541 MachineBasicBlock::iterator Update; 2542 2543 // Look forward to try to form a post-index instruction. For example, 2544 // ldr x0, [x20] 2545 // add x20, x20, #32 2546 // merged into: 2547 // ldr x0, [x20], #32 2548 Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); 2549 if (Update != E) { 2550 // Merge the update into the ld/st. 2551 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); 2552 return true; 2553 } 2554 2555 // Don't know how to handle unscaled pre/post-index versions below, so bail. 2556 if (TII->hasUnscaledLdStOffset(MI.getOpcode())) 2557 return false; 2558 2559 // Look back to try to find a pre-index instruction. For example, 2560 // add x0, x0, #8 2561 // ldr x1, [x0] 2562 // merged into: 2563 // ldr x1, [x0, #8]! 
2564 Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
2565 if (Update != E) {
2566 // Merge the update into the ld/st.
2567 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2568 return true;
2569 }
2570
2571 // The immediate in the load/store is scaled by the size of the memory
2572 // operation. The immediate in the add we're looking for,
2573 // however, is not, so adjust here.
2574 int UnscaledOffset =
2575 AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
2576
2577 // Look forward to try to find a pre-index instruction. For example,
2578 // ldr x1, [x0, #64]
2579 // add x0, x0, #64
2580 // merged into:
2581 // ldr x1, [x0, #64]!
2582 Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
2583 if (Update != E) {
2584 // Merge the update into the ld/st.
2585 MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
2586 return true;
2587 }
2588
2589 return false;
2590 }
2591
2592 bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
2593 int Scale) {
2594 MachineInstr &MI = *MBBI;
2595 MachineBasicBlock::iterator E = MI.getParent()->end();
2596 MachineBasicBlock::iterator Update;
2597
2598 // Don't know how to handle unscaled pre/post-index versions below, so bail.
2599 if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
2600 return false;
2601
2602 // Look back to try to find a const offset for an index LdSt instruction. For
2603 // example,
2604 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2605 // ldr x1, [x0, x8]
2606 // merged into:
2607 // add x8, x0, a * (1<<12)
2608 // ldr x1, [x8, imm12]
2609 unsigned Offset;
2610 Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
2611 if (Update != E && (Offset & (Scale - 1)) == 0) {
2612 // Merge the imm12 into the ld/st.
2613 MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
2614 return true;
2615 }
2616
2617 return false;
2618 }
2619
2620 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
2621 bool EnableNarrowZeroStOpt) {
2622
2623 bool Modified = false;
2624 // Five transformations to do here:
2625 // 1) Find loads that directly read from stores and promote them by
2626 // replacing with mov instructions. If the store is wider than the load,
2627 // the load will be replaced with a bitfield extract.
2628 // e.g.,
2629 // str w1, [x0, #4]
2630 // ldrh w2, [x0, #6]
2631 // ; becomes
2632 // str w1, [x0, #4]
2633 // lsr w2, w1, #16
2634 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2635 MBBI != E;) {
2636 if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
2637 Modified = true;
2638 else
2639 ++MBBI;
2640 }
2641 // 2) Merge adjacent zero stores into a wider store.
2642 // e.g.,
2643 // strh wzr, [x0]
2644 // strh wzr, [x0, #2]
2645 // ; becomes
2646 // str wzr, [x0]
2647 // e.g.,
2648 // str wzr, [x0]
2649 // str wzr, [x0, #4]
2650 // ; becomes
2651 // str xzr, [x0]
2652 if (EnableNarrowZeroStOpt)
2653 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2654 MBBI != E;) {
2655 if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
2656 Modified = true;
2657 else
2658 ++MBBI;
2659 }
2660 // 3) Find loads and stores that can be merged into a single load or store
2661 // pair instruction.
2662 // e.g.,
2663 // ldr x0, [x2]
2664 // ldr x1, [x2, #8]
2665 // ; becomes
2666 // ldp x0, x1, [x2]
2667
2668 if (MBB.getParent()->getRegInfo().tracksLiveness()) {
2669 DefinedInBB.clear();
2670 DefinedInBB.addLiveIns(MBB);
2671 }
2672
2673 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2674 MBBI != E;) {
2675 // Track currently live registers up to this point, to help with
2676 // searching for a rename register on demand.
2677 updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
2678 if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
2679 Modified = true;
2680 else
2681 ++MBBI;
2682 }
2683 // 4) Find base register updates that can be merged into the load or store
2684 // as a base-reg writeback.
2685 // e.g.,
2686 // ldr x0, [x2]
2687 // add x2, x2, #4
2688 // ; becomes
2689 // ldr x0, [x2], #4
2690 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2691 MBBI != E;) {
2692 if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
2693 Modified = true;
2694 else
2695 ++MBBI;
2696 }
2697
2698 // 5) Find a register assigned with a const value that can be combined
2699 // into the load or store. e.g.,
2700 // mov x8, #LargeImm ; = a * (1<<12) + imm12
2701 // ldr x1, [x0, x8]
2702 // ; becomes
2703 // add x8, x0, a * (1<<12)
2704 // ldr x1, [x8, imm12]
2705 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
2706 MBBI != E;) {
2707 int Scale;
2708 if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
2709 Modified = true;
2710 else
2711 ++MBBI;
2712 }
2713
2714 return Modified;
2715 }
2716
2717 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
2718 if (skipFunction(Fn.getFunction()))
2719 return false;
2720
2721 Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
2722 TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
2723 TRI = Subtarget->getRegisterInfo();
2724 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2725
2726 // Resize the modified and used register unit trackers. We do this once
2727 // per function and then clear the register units each time we optimize a load
2728 // or store.
2729 ModifiedRegUnits.init(*TRI);
2730 UsedRegUnits.init(*TRI);
2731 DefinedInBB.init(*TRI);
2732
2733 bool Modified = false;
2734 bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
2735 for (auto &MBB : Fn) {
2736 auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
2737 Modified |= M;
2738 }
2739
2740 return Modified;
2741 }
2742
2743 // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
2744 // stores near one another? Note: The pre-RA instruction scheduler already has
2745 // hooks to try and schedule pairable loads/stores together to improve pairing
2746 // opportunities. Thus, a pre-RA pairing pass may not be worth the effort.
2747
2748 // FIXME: When pairing store instructions it's very possible for this pass to
2749 // hoist a store with a KILL marker above another use (without a KILL marker).
2750 // The resulting IR is invalid, but nothing uses the KILL markers after this
2751 // pass, so it's never caused a problem in practice.
2752
2753 /// createAArch64LoadStoreOptimizationPass - returns an instance of the
2754 /// load / store optimization pass.
2755 FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
2756 return new AArch64LoadStoreOpt();
2757 }
2758