//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag, or, alternatively, it should apply an
// ad-hoc fix to the unwind information.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
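// For example (schematically), given "str w8, [x0]; <w8 redefined>; str w8,
// [x0, #4]", renaming the first store's w8 (and its def) to a spare register
// lets the two stores still be combined into a single stp.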
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If not none, RenameReg can be used to rename the result register of the
  // first store in a pair. Currently this only works when merging stores
  // forward.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  LiveRegUnits DefinedInBB;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero stores indicated into a single wider store
  // instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
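  // For example, in "str w0, [sp, #12]; ldr w1, [sp, #12]" the load can be
  // replaced with a register move from w0; if the load is narrower than the
  // store, a bitfield extract (or mask) of the stored register is emitted
  // instead.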
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
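// For example, STGi writes the allocation tag for one 16-byte granule and
// leaves its data unchanged, while STZGi also zeroes the granule; ST2Gi and
// STZ2Gi do the same for two consecutive granules.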
235 static bool isTagStore(const MachineInstr &MI) { 236 switch (MI.getOpcode()) { 237 default: 238 return false; 239 case AArch64::STGi: 240 case AArch64::STZGi: 241 case AArch64::ST2Gi: 242 case AArch64::STZ2Gi: 243 return true; 244 } 245 } 246 247 static unsigned getMatchingNonSExtOpcode(unsigned Opc, 248 bool *IsValidLdStrOpc = nullptr) { 249 if (IsValidLdStrOpc) 250 *IsValidLdStrOpc = true; 251 switch (Opc) { 252 default: 253 if (IsValidLdStrOpc) 254 *IsValidLdStrOpc = false; 255 return std::numeric_limits<unsigned>::max(); 256 case AArch64::STRDui: 257 case AArch64::STURDi: 258 case AArch64::STRDpre: 259 case AArch64::STRQui: 260 case AArch64::STURQi: 261 case AArch64::STRQpre: 262 case AArch64::STRBBui: 263 case AArch64::STURBBi: 264 case AArch64::STRHHui: 265 case AArch64::STURHHi: 266 case AArch64::STRWui: 267 case AArch64::STRWpre: 268 case AArch64::STURWi: 269 case AArch64::STRXui: 270 case AArch64::STRXpre: 271 case AArch64::STURXi: 272 case AArch64::LDRDui: 273 case AArch64::LDURDi: 274 case AArch64::LDRDpre: 275 case AArch64::LDRQui: 276 case AArch64::LDURQi: 277 case AArch64::LDRQpre: 278 case AArch64::LDRWui: 279 case AArch64::LDURWi: 280 case AArch64::LDRWpre: 281 case AArch64::LDRXui: 282 case AArch64::LDURXi: 283 case AArch64::LDRXpre: 284 case AArch64::STRSui: 285 case AArch64::STURSi: 286 case AArch64::STRSpre: 287 case AArch64::LDRSui: 288 case AArch64::LDURSi: 289 case AArch64::LDRSpre: 290 return Opc; 291 case AArch64::LDRSWui: 292 return AArch64::LDRWui; 293 case AArch64::LDURSWi: 294 return AArch64::LDURWi; 295 case AArch64::LDRSWpre: 296 return AArch64::LDRWpre; 297 } 298 } 299 300 static unsigned getMatchingWideOpcode(unsigned Opc) { 301 switch (Opc) { 302 default: 303 llvm_unreachable("Opcode has no wide equivalent!"); 304 case AArch64::STRBBui: 305 return AArch64::STRHHui; 306 case AArch64::STRHHui: 307 return AArch64::STRWui; 308 case AArch64::STURBBi: 309 return AArch64::STURHHi; 310 case AArch64::STURHHi: 311 return AArch64::STURWi; 312 case AArch64::STURWi: 313 return AArch64::STURXi; 314 case AArch64::STRWui: 315 return AArch64::STRXui; 316 } 317 } 318 319 static unsigned getMatchingPairOpcode(unsigned Opc) { 320 switch (Opc) { 321 default: 322 llvm_unreachable("Opcode has no pairwise equivalent!"); 323 case AArch64::STRSui: 324 case AArch64::STURSi: 325 return AArch64::STPSi; 326 case AArch64::STRSpre: 327 return AArch64::STPSpre; 328 case AArch64::STRDui: 329 case AArch64::STURDi: 330 return AArch64::STPDi; 331 case AArch64::STRDpre: 332 return AArch64::STPDpre; 333 case AArch64::STRQui: 334 case AArch64::STURQi: 335 return AArch64::STPQi; 336 case AArch64::STRQpre: 337 return AArch64::STPQpre; 338 case AArch64::STRWui: 339 case AArch64::STURWi: 340 return AArch64::STPWi; 341 case AArch64::STRWpre: 342 return AArch64::STPWpre; 343 case AArch64::STRXui: 344 case AArch64::STURXi: 345 return AArch64::STPXi; 346 case AArch64::STRXpre: 347 return AArch64::STPXpre; 348 case AArch64::LDRSui: 349 case AArch64::LDURSi: 350 return AArch64::LDPSi; 351 case AArch64::LDRSpre: 352 return AArch64::LDPSpre; 353 case AArch64::LDRDui: 354 case AArch64::LDURDi: 355 return AArch64::LDPDi; 356 case AArch64::LDRDpre: 357 return AArch64::LDPDpre; 358 case AArch64::LDRQui: 359 case AArch64::LDURQi: 360 return AArch64::LDPQi; 361 case AArch64::LDRQpre: 362 return AArch64::LDPQpre; 363 case AArch64::LDRWui: 364 case AArch64::LDURWi: 365 return AArch64::LDPWi; 366 case AArch64::LDRWpre: 367 return AArch64::LDPWpre; 368 case AArch64::LDRXui: 369 case AArch64::LDURXi: 370 return 
AArch64::LDPXi; 371 case AArch64::LDRXpre: 372 return AArch64::LDPXpre; 373 case AArch64::LDRSWui: 374 case AArch64::LDURSWi: 375 return AArch64::LDPSWi; 376 case AArch64::LDRSWpre: 377 return AArch64::LDPSWpre; 378 } 379 } 380 381 static unsigned isMatchingStore(MachineInstr &LoadInst, 382 MachineInstr &StoreInst) { 383 unsigned LdOpc = LoadInst.getOpcode(); 384 unsigned StOpc = StoreInst.getOpcode(); 385 switch (LdOpc) { 386 default: 387 llvm_unreachable("Unsupported load instruction!"); 388 case AArch64::LDRBBui: 389 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || 390 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 391 case AArch64::LDURBBi: 392 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || 393 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 394 case AArch64::LDRHHui: 395 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || 396 StOpc == AArch64::STRXui; 397 case AArch64::LDURHHi: 398 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || 399 StOpc == AArch64::STURXi; 400 case AArch64::LDRWui: 401 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 402 case AArch64::LDURWi: 403 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 404 case AArch64::LDRXui: 405 return StOpc == AArch64::STRXui; 406 case AArch64::LDURXi: 407 return StOpc == AArch64::STURXi; 408 } 409 } 410 411 static unsigned getPreIndexedOpcode(unsigned Opc) { 412 // FIXME: We don't currently support creating pre-indexed loads/stores when 413 // the load or store is the unscaled version. If we decide to perform such an 414 // optimization in the future the cases for the unscaled loads/stores will 415 // need to be added here. 416 switch (Opc) { 417 default: 418 llvm_unreachable("Opcode has no pre-indexed equivalent!"); 419 case AArch64::STRSui: 420 return AArch64::STRSpre; 421 case AArch64::STRDui: 422 return AArch64::STRDpre; 423 case AArch64::STRQui: 424 return AArch64::STRQpre; 425 case AArch64::STRBBui: 426 return AArch64::STRBBpre; 427 case AArch64::STRHHui: 428 return AArch64::STRHHpre; 429 case AArch64::STRWui: 430 return AArch64::STRWpre; 431 case AArch64::STRXui: 432 return AArch64::STRXpre; 433 case AArch64::LDRSui: 434 return AArch64::LDRSpre; 435 case AArch64::LDRDui: 436 return AArch64::LDRDpre; 437 case AArch64::LDRQui: 438 return AArch64::LDRQpre; 439 case AArch64::LDRBBui: 440 return AArch64::LDRBBpre; 441 case AArch64::LDRHHui: 442 return AArch64::LDRHHpre; 443 case AArch64::LDRWui: 444 return AArch64::LDRWpre; 445 case AArch64::LDRXui: 446 return AArch64::LDRXpre; 447 case AArch64::LDRSWui: 448 return AArch64::LDRSWpre; 449 case AArch64::LDPSi: 450 return AArch64::LDPSpre; 451 case AArch64::LDPSWi: 452 return AArch64::LDPSWpre; 453 case AArch64::LDPDi: 454 return AArch64::LDPDpre; 455 case AArch64::LDPQi: 456 return AArch64::LDPQpre; 457 case AArch64::LDPWi: 458 return AArch64::LDPWpre; 459 case AArch64::LDPXi: 460 return AArch64::LDPXpre; 461 case AArch64::STPSi: 462 return AArch64::STPSpre; 463 case AArch64::STPDi: 464 return AArch64::STPDpre; 465 case AArch64::STPQi: 466 return AArch64::STPQpre; 467 case AArch64::STPWi: 468 return AArch64::STPWpre; 469 case AArch64::STPXi: 470 return AArch64::STPXpre; 471 case AArch64::STGi: 472 return AArch64::STGPreIndex; 473 case AArch64::STZGi: 474 return AArch64::STZGPreIndex; 475 case AArch64::ST2Gi: 476 return AArch64::ST2GPreIndex; 477 case AArch64::STZ2Gi: 478 return AArch64::STZ2GPreIndex; 479 case AArch64::STGPi: 480 return AArch64::STGPpre; 481 } 482 } 483 484 static 
unsigned getPostIndexedOpcode(unsigned Opc) { 485 switch (Opc) { 486 default: 487 llvm_unreachable("Opcode has no post-indexed wise equivalent!"); 488 case AArch64::STRSui: 489 case AArch64::STURSi: 490 return AArch64::STRSpost; 491 case AArch64::STRDui: 492 case AArch64::STURDi: 493 return AArch64::STRDpost; 494 case AArch64::STRQui: 495 case AArch64::STURQi: 496 return AArch64::STRQpost; 497 case AArch64::STRBBui: 498 return AArch64::STRBBpost; 499 case AArch64::STRHHui: 500 return AArch64::STRHHpost; 501 case AArch64::STRWui: 502 case AArch64::STURWi: 503 return AArch64::STRWpost; 504 case AArch64::STRXui: 505 case AArch64::STURXi: 506 return AArch64::STRXpost; 507 case AArch64::LDRSui: 508 case AArch64::LDURSi: 509 return AArch64::LDRSpost; 510 case AArch64::LDRDui: 511 case AArch64::LDURDi: 512 return AArch64::LDRDpost; 513 case AArch64::LDRQui: 514 case AArch64::LDURQi: 515 return AArch64::LDRQpost; 516 case AArch64::LDRBBui: 517 return AArch64::LDRBBpost; 518 case AArch64::LDRHHui: 519 return AArch64::LDRHHpost; 520 case AArch64::LDRWui: 521 case AArch64::LDURWi: 522 return AArch64::LDRWpost; 523 case AArch64::LDRXui: 524 case AArch64::LDURXi: 525 return AArch64::LDRXpost; 526 case AArch64::LDRSWui: 527 return AArch64::LDRSWpost; 528 case AArch64::LDPSi: 529 return AArch64::LDPSpost; 530 case AArch64::LDPSWi: 531 return AArch64::LDPSWpost; 532 case AArch64::LDPDi: 533 return AArch64::LDPDpost; 534 case AArch64::LDPQi: 535 return AArch64::LDPQpost; 536 case AArch64::LDPWi: 537 return AArch64::LDPWpost; 538 case AArch64::LDPXi: 539 return AArch64::LDPXpost; 540 case AArch64::STPSi: 541 return AArch64::STPSpost; 542 case AArch64::STPDi: 543 return AArch64::STPDpost; 544 case AArch64::STPQi: 545 return AArch64::STPQpost; 546 case AArch64::STPWi: 547 return AArch64::STPWpost; 548 case AArch64::STPXi: 549 return AArch64::STPXpost; 550 case AArch64::STGi: 551 return AArch64::STGPostIndex; 552 case AArch64::STZGi: 553 return AArch64::STZGPostIndex; 554 case AArch64::ST2Gi: 555 return AArch64::ST2GPostIndex; 556 case AArch64::STZ2Gi: 557 return AArch64::STZ2GPostIndex; 558 case AArch64::STGPi: 559 return AArch64::STGPpost; 560 } 561 } 562 563 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) { 564 565 unsigned OpcA = FirstMI.getOpcode(); 566 unsigned OpcB = MI.getOpcode(); 567 568 switch (OpcA) { 569 default: 570 return false; 571 case AArch64::STRSpre: 572 return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi); 573 case AArch64::STRDpre: 574 return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi); 575 case AArch64::STRQpre: 576 return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi); 577 case AArch64::STRWpre: 578 return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi); 579 case AArch64::STRXpre: 580 return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi); 581 case AArch64::LDRSpre: 582 return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi); 583 case AArch64::LDRDpre: 584 return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi); 585 case AArch64::LDRQpre: 586 return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi); 587 case AArch64::LDRWpre: 588 return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi); 589 case AArch64::LDRXpre: 590 return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi); 591 case AArch64::LDRSWpre: 592 return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi); 593 } 594 } 595 596 // Returns the scale and offset range of pre/post indexed variants of MI. 
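// For example, a paired access such as STPXpre scales its immediate by the
// access size (8 bytes for X registers), giving byte offsets in [-512, 504],
// while a single pre/post-indexed access like STRXpre takes an unscaled 9-bit
// offset in [-256, 255].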
597 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, 598 int &MinOffset, int &MaxOffset) { 599 bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI); 600 bool IsTagStore = isTagStore(MI); 601 // ST*G and all paired ldst have the same scale in pre/post-indexed variants 602 // as in the "unsigned offset" variant. 603 // All other pre/post indexed ldst instructions are unscaled. 604 Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1; 605 606 if (IsPaired) { 607 MinOffset = -64; 608 MaxOffset = 63; 609 } else { 610 MinOffset = -256; 611 MaxOffset = 255; 612 } 613 } 614 615 static MachineOperand &getLdStRegOp(MachineInstr &MI, 616 unsigned PairedRegOp = 0) { 617 assert(PairedRegOp < 2 && "Unexpected register operand idx."); 618 bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI); 619 if (IsPreLdSt) 620 PairedRegOp += 1; 621 unsigned Idx = 622 AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0; 623 return MI.getOperand(Idx); 624 } 625 626 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, 627 MachineInstr &StoreInst, 628 const AArch64InstrInfo *TII) { 629 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); 630 int LoadSize = TII->getMemScale(LoadInst); 631 int StoreSize = TII->getMemScale(StoreInst); 632 int UnscaledStOffset = 633 TII->hasUnscaledLdStOffset(StoreInst) 634 ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() 635 : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize; 636 int UnscaledLdOffset = 637 TII->hasUnscaledLdStOffset(LoadInst) 638 ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() 639 : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize; 640 return (UnscaledStOffset <= UnscaledLdOffset) && 641 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); 642 } 643 644 static bool isPromotableZeroStoreInst(MachineInstr &MI) { 645 unsigned Opc = MI.getOpcode(); 646 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi || 647 isNarrowStore(Opc)) && 648 getLdStRegOp(MI).getReg() == AArch64::WZR; 649 } 650 651 static bool isPromotableLoadFromStore(MachineInstr &MI) { 652 switch (MI.getOpcode()) { 653 default: 654 return false; 655 // Scaled instructions. 656 case AArch64::LDRBBui: 657 case AArch64::LDRHHui: 658 case AArch64::LDRWui: 659 case AArch64::LDRXui: 660 // Unscaled instructions. 661 case AArch64::LDURBBi: 662 case AArch64::LDURHHi: 663 case AArch64::LDURWi: 664 case AArch64::LDURXi: 665 return true; 666 } 667 } 668 669 static bool isMergeableLdStUpdate(MachineInstr &MI) { 670 unsigned Opc = MI.getOpcode(); 671 switch (Opc) { 672 default: 673 return false; 674 // Scaled instructions. 675 case AArch64::STRSui: 676 case AArch64::STRDui: 677 case AArch64::STRQui: 678 case AArch64::STRXui: 679 case AArch64::STRWui: 680 case AArch64::STRHHui: 681 case AArch64::STRBBui: 682 case AArch64::LDRSui: 683 case AArch64::LDRDui: 684 case AArch64::LDRQui: 685 case AArch64::LDRXui: 686 case AArch64::LDRWui: 687 case AArch64::LDRHHui: 688 case AArch64::LDRBBui: 689 case AArch64::STGi: 690 case AArch64::STZGi: 691 case AArch64::ST2Gi: 692 case AArch64::STZ2Gi: 693 case AArch64::STGPi: 694 // Unscaled instructions. 695 case AArch64::STURSi: 696 case AArch64::STURDi: 697 case AArch64::STURQi: 698 case AArch64::STURWi: 699 case AArch64::STURXi: 700 case AArch64::LDURSi: 701 case AArch64::LDURDi: 702 case AArch64::LDURQi: 703 case AArch64::LDURWi: 704 case AArch64::LDURXi: 705 // Paired instructions. 
706 case AArch64::LDPSi: 707 case AArch64::LDPSWi: 708 case AArch64::LDPDi: 709 case AArch64::LDPQi: 710 case AArch64::LDPWi: 711 case AArch64::LDPXi: 712 case AArch64::STPSi: 713 case AArch64::STPDi: 714 case AArch64::STPQi: 715 case AArch64::STPWi: 716 case AArch64::STPXi: 717 // Make sure this is a reg+imm (as opposed to an address reloc). 718 if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) 719 return false; 720 721 return true; 722 } 723 } 724 725 static bool isRewritableImplicitDef(unsigned Opc) { 726 switch (Opc) { 727 default: 728 return false; 729 case AArch64::ORRWrs: 730 case AArch64::ADDWri: 731 return true; 732 } 733 } 734 735 MachineBasicBlock::iterator 736 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, 737 MachineBasicBlock::iterator MergeMI, 738 const LdStPairFlags &Flags) { 739 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && 740 "Expected promotable zero stores."); 741 742 MachineBasicBlock::iterator E = I->getParent()->end(); 743 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 744 // If NextI is the second of the two instructions to be merged, we need 745 // to skip one further. Either way we merge will invalidate the iterator, 746 // and we don't need to scan the new instruction, as it's a pairwise 747 // instruction, which we're not considering for further action anyway. 748 if (NextI == MergeMI) 749 NextI = next_nodbg(NextI, E); 750 751 unsigned Opc = I->getOpcode(); 752 unsigned MergeMIOpc = MergeMI->getOpcode(); 753 bool IsScaled = !TII->hasUnscaledLdStOffset(Opc); 754 bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc); 755 int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1; 756 int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1; 757 758 bool MergeForward = Flags.getMergeForward(); 759 // Insert our new paired instruction after whichever of the paired 760 // instructions MergeForward indicates. 761 MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; 762 // Also based on MergeForward is from where we copy the base register operand 763 // so we get the flags compatible with the input code. 764 const MachineOperand &BaseRegOp = 765 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI) 766 : AArch64InstrInfo::getLdStBaseOp(*I); 767 768 // Which register is Rt and which is Rt2 depends on the offset order. 769 int64_t IOffsetInBytes = 770 AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride; 771 int64_t MIOffsetInBytes = 772 AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() * 773 MergeMIOffsetStride; 774 // Select final offset based on the offset order. 775 int64_t OffsetImm; 776 if (IOffsetInBytes > MIOffsetInBytes) 777 OffsetImm = MIOffsetInBytes; 778 else 779 OffsetImm = IOffsetInBytes; 780 781 int NewOpcode = getMatchingWideOpcode(Opc); 782 bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode); 783 784 // Adjust final offset if the result opcode is a scaled store. 785 if (FinalIsScaled) { 786 int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1; 787 assert(((OffsetImm % NewOffsetStride) == 0) && 788 "Offset should be a multiple of the store memory scale"); 789 OffsetImm = OffsetImm / NewOffsetStride; 790 } 791 792 // Construct the new instruction. 793 DebugLoc DL = I->getDebugLoc(); 794 MachineBasicBlock *MBB = I->getParent(); 795 MachineInstrBuilder MIB; 796 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) 797 .addReg(isNarrowStore(Opc) ? 
AArch64::WZR : AArch64::XZR) 798 .add(BaseRegOp) 799 .addImm(OffsetImm) 800 .cloneMergedMemRefs({&*I, &*MergeMI}) 801 .setMIFlags(I->mergeFlagsWith(*MergeMI)); 802 (void)MIB; 803 804 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n "); 805 LLVM_DEBUG(I->print(dbgs())); 806 LLVM_DEBUG(dbgs() << " "); 807 LLVM_DEBUG(MergeMI->print(dbgs())); 808 LLVM_DEBUG(dbgs() << " with instruction:\n "); 809 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 810 LLVM_DEBUG(dbgs() << "\n"); 811 812 // Erase the old instructions. 813 I->eraseFromParent(); 814 MergeMI->eraseFromParent(); 815 return NextI; 816 } 817 818 // Apply Fn to all instructions between MI and the beginning of the block, until 819 // a def for DefReg is reached. Returns true, iff Fn returns true for all 820 // visited instructions. Stop after visiting Limit iterations. 821 static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg, 822 const TargetRegisterInfo *TRI, unsigned Limit, 823 std::function<bool(MachineInstr &, bool)> &Fn) { 824 auto MBB = MI.getParent(); 825 for (MachineInstr &I : 826 instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) { 827 if (!Limit) 828 return false; 829 --Limit; 830 831 bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) { 832 return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() && 833 TRI->regsOverlap(MOP.getReg(), DefReg); 834 }); 835 if (!Fn(I, isDef)) 836 return false; 837 if (isDef) 838 break; 839 } 840 return true; 841 } 842 843 static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units, 844 const TargetRegisterInfo *TRI) { 845 846 for (const MachineOperand &MOP : phys_regs_and_masks(MI)) 847 if (MOP.isReg() && MOP.isKill()) 848 Units.removeReg(MOP.getReg()); 849 850 for (const MachineOperand &MOP : phys_regs_and_masks(MI)) 851 if (MOP.isReg() && !MOP.isKill()) 852 Units.addReg(MOP.getReg()); 853 } 854 855 MachineBasicBlock::iterator 856 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, 857 MachineBasicBlock::iterator Paired, 858 const LdStPairFlags &Flags) { 859 MachineBasicBlock::iterator E = I->getParent()->end(); 860 MachineBasicBlock::iterator NextI = next_nodbg(I, E); 861 // If NextI is the second of the two instructions to be merged, we need 862 // to skip one further. Either way we merge will invalidate the iterator, 863 // and we don't need to scan the new instruction, as it's a pairwise 864 // instruction, which we're not considering for further action anyway. 865 if (NextI == Paired) 866 NextI = next_nodbg(NextI, E); 867 868 int SExtIdx = Flags.getSExtIdx(); 869 unsigned Opc = 870 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); 871 bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc); 872 int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1; 873 874 bool MergeForward = Flags.getMergeForward(); 875 876 std::optional<MCPhysReg> RenameReg = Flags.getRenameReg(); 877 if (RenameReg) { 878 MCRegister RegToRename = getLdStRegOp(*I).getReg(); 879 DefinedInBB.addReg(*RenameReg); 880 881 // Return the sub/super register for RenameReg, matching the size of 882 // OriginalReg. 
883 auto GetMatchingSubReg = 884 [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg { 885 for (MCPhysReg SubOrSuper : 886 TRI->sub_and_superregs_inclusive(*RenameReg)) { 887 if (C->contains(SubOrSuper)) 888 return SubOrSuper; 889 } 890 llvm_unreachable("Should have found matching sub or super register!"); 891 }; 892 893 std::function<bool(MachineInstr &, bool)> UpdateMIs = 894 [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI, 895 bool IsDef) { 896 if (IsDef) { 897 bool SeenDef = false; 898 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { 899 MachineOperand &MOP = MI.getOperand(OpIdx); 900 // Rename the first explicit definition and all implicit 901 // definitions matching RegToRename. 902 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 903 (!MergeForward || !SeenDef || 904 (MOP.isDef() && MOP.isImplicit())) && 905 TRI->regsOverlap(MOP.getReg(), RegToRename)) { 906 assert((MOP.isImplicit() || 907 (MOP.isRenamable() && !MOP.isEarlyClobber())) && 908 "Need renamable operands"); 909 Register MatchingReg; 910 if (const TargetRegisterClass *RC = 911 MI.getRegClassConstraint(OpIdx, TII, TRI)) 912 MatchingReg = GetMatchingSubReg(RC); 913 else { 914 if (!isRewritableImplicitDef(MI.getOpcode())) 915 continue; 916 MatchingReg = GetMatchingSubReg( 917 TRI->getMinimalPhysRegClass(MOP.getReg())); 918 } 919 MOP.setReg(MatchingReg); 920 SeenDef = true; 921 } 922 } 923 } else { 924 for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) { 925 MachineOperand &MOP = MI.getOperand(OpIdx); 926 if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 927 TRI->regsOverlap(MOP.getReg(), RegToRename)) { 928 assert((MOP.isImplicit() || 929 (MOP.isRenamable() && !MOP.isEarlyClobber())) && 930 "Need renamable operands"); 931 Register MatchingReg; 932 if (const TargetRegisterClass *RC = 933 MI.getRegClassConstraint(OpIdx, TII, TRI)) 934 MatchingReg = GetMatchingSubReg(RC); 935 else 936 MatchingReg = GetMatchingSubReg( 937 TRI->getMinimalPhysRegClass(MOP.getReg())); 938 assert(MatchingReg != AArch64::NoRegister && 939 "Cannot find matching regs for renaming"); 940 MOP.setReg(MatchingReg); 941 } 942 } 943 } 944 LLVM_DEBUG(dbgs() << "Renamed " << MI); 945 return true; 946 }; 947 forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI, 948 UINT32_MAX, UpdateMIs); 949 950 #if !defined(NDEBUG) 951 // For forward merging store: 952 // Make sure the register used for renaming is not used between the 953 // paired instructions. That would trash the content before the new 954 // paired instruction. 955 MCPhysReg RegToCheck = *RenameReg; 956 // For backward merging load: 957 // Make sure the register being renamed is not used between the 958 // paired instructions. That would trash the content after the new 959 // paired instruction. 960 if (!MergeForward) 961 RegToCheck = RegToRename; 962 for (auto &MI : 963 iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>( 964 MergeForward ? std::next(I) : I, 965 MergeForward ? std::next(Paired) : Paired)) 966 assert(all_of(MI.operands(), 967 [this, RegToCheck](const MachineOperand &MOP) { 968 return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() || 969 MOP.isUndef() || 970 !TRI->regsOverlap(MOP.getReg(), RegToCheck); 971 }) && 972 "Rename register used between paired instruction, trashing the " 973 "content"); 974 #endif 975 } 976 977 // Insert our new paired instruction after whichever of the paired 978 // instructions MergeForward indicates. 979 MachineBasicBlock::iterator InsertionPoint = MergeForward ? 
Paired : I; 980 // Also based on MergeForward is from where we copy the base register operand 981 // so we get the flags compatible with the input code. 982 const MachineOperand &BaseRegOp = 983 MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired) 984 : AArch64InstrInfo::getLdStBaseOp(*I); 985 986 int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm(); 987 int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm(); 988 bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode()); 989 if (IsUnscaled != PairedIsUnscaled) { 990 // We're trying to pair instructions that differ in how they are scaled. If 991 // I is scaled then scale the offset of Paired accordingly. Otherwise, do 992 // the opposite (i.e., make Paired's offset unscaled). 993 int MemSize = TII->getMemScale(*Paired); 994 if (PairedIsUnscaled) { 995 // If the unscaled offset isn't a multiple of the MemSize, we can't 996 // pair the operations together. 997 assert(!(PairedOffset % TII->getMemScale(*Paired)) && 998 "Offset should be a multiple of the stride!"); 999 PairedOffset /= MemSize; 1000 } else { 1001 PairedOffset *= MemSize; 1002 } 1003 } 1004 1005 // Which register is Rt and which is Rt2 depends on the offset order. 1006 // However, for pre load/stores the Rt should be the one of the pre 1007 // load/store. 1008 MachineInstr *RtMI, *Rt2MI; 1009 if (Offset == PairedOffset + OffsetStride && 1010 !AArch64InstrInfo::isPreLdSt(*I)) { 1011 RtMI = &*Paired; 1012 Rt2MI = &*I; 1013 // Here we swapped the assumption made for SExtIdx. 1014 // I.e., we turn ldp I, Paired into ldp Paired, I. 1015 // Update the index accordingly. 1016 if (SExtIdx != -1) 1017 SExtIdx = (SExtIdx + 1) % 2; 1018 } else { 1019 RtMI = &*I; 1020 Rt2MI = &*Paired; 1021 } 1022 int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm(); 1023 // Scale the immediate offset, if necessary. 1024 if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) { 1025 assert(!(OffsetImm % TII->getMemScale(*RtMI)) && 1026 "Unscaled offset cannot be scaled."); 1027 OffsetImm /= TII->getMemScale(*RtMI); 1028 } 1029 1030 // Construct the new instruction. 1031 MachineInstrBuilder MIB; 1032 DebugLoc DL = I->getDebugLoc(); 1033 MachineBasicBlock *MBB = I->getParent(); 1034 MachineOperand RegOp0 = getLdStRegOp(*RtMI); 1035 MachineOperand RegOp1 = getLdStRegOp(*Rt2MI); 1036 MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1; 1037 // Kill flags may become invalid when moving stores for pairing. 1038 if (RegOp0.isUse()) { 1039 if (!MergeForward) { 1040 // Clear kill flags on store if moving upwards. Example: 1041 // STRWui kill %w0, ... 1042 // USE %w1 1043 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards 1044 // We are about to move the store of w1, so its kill flag may become 1045 // invalid; not the case for w0. 1046 // Since w1 is used between the stores, the kill flag on w1 is cleared 1047 // after merging. 1048 // STPWi kill %w0, %w1, ... 1049 // USE %w1 1050 for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It) 1051 if (It->readsRegister(PairedRegOp.getReg(), TRI)) 1052 PairedRegOp.setIsKill(false); 1053 } else { 1054 // Clear kill flags of the first stores register. Example: 1055 // STRWui %w1, ... 
1056 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards 1057 // STRW %w0 1058 Register Reg = getLdStRegOp(*I).getReg(); 1059 for (MachineInstr &MI : make_range(std::next(I), Paired)) 1060 MI.clearRegisterKills(Reg, TRI); 1061 } 1062 } 1063 1064 unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc); 1065 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode)); 1066 1067 // Adds the pre-index operand for pre-indexed ld/st pairs. 1068 if (AArch64InstrInfo::isPreLdSt(*RtMI)) 1069 MIB.addReg(BaseRegOp.getReg(), RegState::Define); 1070 1071 MIB.add(RegOp0) 1072 .add(RegOp1) 1073 .add(BaseRegOp) 1074 .addImm(OffsetImm) 1075 .cloneMergedMemRefs({&*I, &*Paired}) 1076 .setMIFlags(I->mergeFlagsWith(*Paired)); 1077 1078 (void)MIB; 1079 1080 LLVM_DEBUG( 1081 dbgs() << "Creating pair load/store. Replacing instructions:\n "); 1082 LLVM_DEBUG(I->print(dbgs())); 1083 LLVM_DEBUG(dbgs() << " "); 1084 LLVM_DEBUG(Paired->print(dbgs())); 1085 LLVM_DEBUG(dbgs() << " with instruction:\n "); 1086 if (SExtIdx != -1) { 1087 // Generate the sign extension for the proper result of the ldp. 1088 // I.e., with X1, that would be: 1089 // %w1 = KILL %w1, implicit-def %x1 1090 // %x1 = SBFMXri killed %x1, 0, 31 1091 MachineOperand &DstMO = MIB->getOperand(SExtIdx); 1092 // Right now, DstMO has the extended register, since it comes from an 1093 // extended opcode. 1094 Register DstRegX = DstMO.getReg(); 1095 // Get the W variant of that register. 1096 Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); 1097 // Update the result of LDP to use the W instead of the X variant. 1098 DstMO.setReg(DstRegW); 1099 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1100 LLVM_DEBUG(dbgs() << "\n"); 1101 // Make the machine verifier happy by providing a definition for 1102 // the X register. 1103 // Insert this definition right after the generated LDP, i.e., before 1104 // InsertionPoint. 1105 MachineInstrBuilder MIBKill = 1106 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) 1107 .addReg(DstRegW) 1108 .addReg(DstRegX, RegState::Define); 1109 MIBKill->getOperand(2).setImplicit(); 1110 // Create the sign extension. 1111 MachineInstrBuilder MIBSXTW = 1112 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) 1113 .addReg(DstRegX) 1114 .addImm(0) 1115 .addImm(31); 1116 (void)MIBSXTW; 1117 LLVM_DEBUG(dbgs() << " Extend operand:\n "); 1118 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); 1119 } else { 1120 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1121 } 1122 LLVM_DEBUG(dbgs() << "\n"); 1123 1124 if (MergeForward) 1125 for (const MachineOperand &MOP : phys_regs_and_masks(*I)) 1126 if (MOP.isReg() && MOP.isKill()) 1127 DefinedInBB.addReg(MOP.getReg()); 1128 1129 // Erase the old instructions. 
1130 I->eraseFromParent(); 1131 Paired->eraseFromParent(); 1132 1133 return NextI; 1134 } 1135 1136 MachineBasicBlock::iterator 1137 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 1138 MachineBasicBlock::iterator StoreI) { 1139 MachineBasicBlock::iterator NextI = 1140 next_nodbg(LoadI, LoadI->getParent()->end()); 1141 1142 int LoadSize = TII->getMemScale(*LoadI); 1143 int StoreSize = TII->getMemScale(*StoreI); 1144 Register LdRt = getLdStRegOp(*LoadI).getReg(); 1145 const MachineOperand &StMO = getLdStRegOp(*StoreI); 1146 Register StRt = getLdStRegOp(*StoreI).getReg(); 1147 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); 1148 1149 assert((IsStoreXReg || 1150 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && 1151 "Unexpected RegClass"); 1152 1153 MachineInstr *BitExtMI; 1154 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { 1155 // Remove the load, if the destination register of the loads is the same 1156 // register for stored value. 1157 if (StRt == LdRt && LoadSize == 8) { 1158 for (MachineInstr &MI : make_range(StoreI->getIterator(), 1159 LoadI->getIterator())) { 1160 if (MI.killsRegister(StRt, TRI)) { 1161 MI.clearRegisterKills(StRt, TRI); 1162 break; 1163 } 1164 } 1165 LLVM_DEBUG(dbgs() << "Remove load instruction:\n "); 1166 LLVM_DEBUG(LoadI->print(dbgs())); 1167 LLVM_DEBUG(dbgs() << "\n"); 1168 LoadI->eraseFromParent(); 1169 return NextI; 1170 } 1171 // Replace the load with a mov if the load and store are in the same size. 1172 BitExtMI = 1173 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1174 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) 1175 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) 1176 .add(StMO) 1177 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 1178 .setMIFlags(LoadI->getFlags()); 1179 } else { 1180 // FIXME: Currently we disable this transformation in big-endian targets as 1181 // performance and correctness are verified only in little-endian. 1182 if (!Subtarget->isLittleEndian()) 1183 return NextI; 1184 bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI); 1185 assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) && 1186 "Unsupported ld/st match"); 1187 assert(LoadSize <= StoreSize && "Invalid load size"); 1188 int UnscaledLdOffset = 1189 IsUnscaled 1190 ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() 1191 : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize; 1192 int UnscaledStOffset = 1193 IsUnscaled 1194 ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() 1195 : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize; 1196 int Width = LoadSize * 8; 1197 Register DestReg = 1198 IsStoreXReg ? Register(TRI->getMatchingSuperReg( 1199 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) 1200 : LdRt; 1201 1202 assert((UnscaledLdOffset >= UnscaledStOffset && 1203 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && 1204 "Invalid offset"); 1205 1206 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); 1207 int Imms = Immr + Width - 1; 1208 if (UnscaledLdOffset == UnscaledStOffset) { 1209 uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N 1210 | ((Immr) << 6) // immr 1211 | ((Imms) << 0) // imms 1212 ; 1213 1214 BitExtMI = 1215 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1216 TII->get(IsStoreXReg ? 
AArch64::ANDXri : AArch64::ANDWri), 1217 DestReg) 1218 .add(StMO) 1219 .addImm(AndMaskEncoded) 1220 .setMIFlags(LoadI->getFlags()); 1221 } else { 1222 BitExtMI = 1223 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1224 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), 1225 DestReg) 1226 .add(StMO) 1227 .addImm(Immr) 1228 .addImm(Imms) 1229 .setMIFlags(LoadI->getFlags()); 1230 } 1231 } 1232 1233 // Clear kill flags between store and load. 1234 for (MachineInstr &MI : make_range(StoreI->getIterator(), 1235 BitExtMI->getIterator())) 1236 if (MI.killsRegister(StRt, TRI)) { 1237 MI.clearRegisterKills(StRt, TRI); 1238 break; 1239 } 1240 1241 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n "); 1242 LLVM_DEBUG(StoreI->print(dbgs())); 1243 LLVM_DEBUG(dbgs() << " "); 1244 LLVM_DEBUG(LoadI->print(dbgs())); 1245 LLVM_DEBUG(dbgs() << " with instructions:\n "); 1246 LLVM_DEBUG(StoreI->print(dbgs())); 1247 LLVM_DEBUG(dbgs() << " "); 1248 LLVM_DEBUG((BitExtMI)->print(dbgs())); 1249 LLVM_DEBUG(dbgs() << "\n"); 1250 1251 // Erase the old instructions. 1252 LoadI->eraseFromParent(); 1253 return NextI; 1254 } 1255 1256 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { 1257 // Convert the byte-offset used by unscaled into an "element" offset used 1258 // by the scaled pair load/store instructions. 1259 if (IsUnscaled) { 1260 // If the byte-offset isn't a multiple of the stride, there's no point 1261 // trying to match it. 1262 if (Offset % OffsetStride) 1263 return false; 1264 Offset /= OffsetStride; 1265 } 1266 return Offset <= 63 && Offset >= -64; 1267 } 1268 1269 // Do alignment, specialized to power of 2 and for signed ints, 1270 // avoiding having to do a C-style cast from uint_64t to int when 1271 // using alignTo from include/llvm/Support/MathExtras.h. 1272 // FIXME: Move this function to include/MathExtras.h? 1273 static int alignTo(int Num, int PowOf2) { 1274 return (Num + PowOf2 - 1) & ~(PowOf2 - 1); 1275 } 1276 1277 static bool mayAlias(MachineInstr &MIa, 1278 SmallVectorImpl<MachineInstr *> &MemInsns, 1279 AliasAnalysis *AA) { 1280 for (MachineInstr *MIb : MemInsns) { 1281 if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) { 1282 LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump()); 1283 return true; 1284 } 1285 } 1286 1287 LLVM_DEBUG(dbgs() << "No aliases found\n"); 1288 return false; 1289 } 1290 1291 bool AArch64LoadStoreOpt::findMatchingStore( 1292 MachineBasicBlock::iterator I, unsigned Limit, 1293 MachineBasicBlock::iterator &StoreI) { 1294 MachineBasicBlock::iterator B = I->getParent()->begin(); 1295 MachineBasicBlock::iterator MBBI = I; 1296 MachineInstr &LoadMI = *I; 1297 Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg(); 1298 1299 // If the load is the first instruction in the block, there's obviously 1300 // not any matching store. 1301 if (MBBI == B) 1302 return false; 1303 1304 // Track which register units have been modified and used between the first 1305 // insn and the second insn. 1306 ModifiedRegUnits.clear(); 1307 UsedRegUnits.clear(); 1308 1309 unsigned Count = 0; 1310 do { 1311 MBBI = prev_nodbg(MBBI, B); 1312 MachineInstr &MI = *MBBI; 1313 1314 // Don't count transient instructions towards the search limit since there 1315 // may be different numbers of them if e.g. debug information is present. 
1316 if (!MI.isTransient()) 1317 ++Count; 1318 1319 // If the load instruction reads directly from the address to which the 1320 // store instruction writes and the stored value is not modified, we can 1321 // promote the load. Since we do not handle stores with pre-/post-index, 1322 // it's unnecessary to check if BaseReg is modified by the store itself. 1323 // Also we can't handle stores without an immediate offset operand, 1324 // while the operand might be the address for a global variable. 1325 if (MI.mayStore() && isMatchingStore(LoadMI, MI) && 1326 BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() && 1327 AArch64InstrInfo::getLdStOffsetOp(MI).isImm() && 1328 isLdOffsetInRangeOfSt(LoadMI, MI, TII) && 1329 ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) { 1330 StoreI = MBBI; 1331 return true; 1332 } 1333 1334 if (MI.isCall()) 1335 return false; 1336 1337 // Update modified / uses register units. 1338 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 1339 1340 // Otherwise, if the base register is modified, we have no match, so 1341 // return early. 1342 if (!ModifiedRegUnits.available(BaseReg)) 1343 return false; 1344 1345 // If we encounter a store aliased with the load, return early. 1346 if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false)) 1347 return false; 1348 } while (MBBI != B && Count < Limit); 1349 return false; 1350 } 1351 1352 static bool needsWinCFI(const MachineFunction *MF) { 1353 return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() && 1354 MF->getFunction().needsUnwindTableEntry(); 1355 } 1356 1357 // Returns true if FirstMI and MI are candidates for merging or pairing. 1358 // Otherwise, returns false. 1359 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, 1360 LdStPairFlags &Flags, 1361 const AArch64InstrInfo *TII) { 1362 // If this is volatile or if pairing is suppressed, not a candidate. 1363 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) 1364 return false; 1365 1366 // We should have already checked FirstMI for pair suppression and volatility. 1367 assert(!FirstMI.hasOrderedMemoryRef() && 1368 !TII->isLdStPairSuppressed(FirstMI) && 1369 "FirstMI shouldn't get here if either of these checks are true."); 1370 1371 if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) || 1372 MI.getFlag(MachineInstr::FrameDestroy))) 1373 return false; 1374 1375 unsigned OpcA = FirstMI.getOpcode(); 1376 unsigned OpcB = MI.getOpcode(); 1377 1378 // Opcodes match: If the opcodes are pre ld/st there is nothing more to check. 1379 if (OpcA == OpcB) 1380 return !AArch64InstrInfo::isPreLdSt(FirstMI); 1381 1382 // Two pre ld/st of different opcodes cannot be merged either 1383 if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI)) 1384 return false; 1385 1386 // Try to match a sign-extended load/store with a zero-extended load/store. 1387 bool IsValidLdStrOpc, PairIsValidLdStrOpc; 1388 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); 1389 assert(IsValidLdStrOpc && 1390 "Given Opc should be a Load or Store with an immediate"); 1391 // OpcA will be the first instruction in the pair. 1392 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { 1393 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); 1394 return true; 1395 } 1396 1397 // If the second instruction isn't even a mergable/pairable load/store, bail 1398 // out. 
1399 if (!PairIsValidLdStrOpc) 1400 return false; 1401 1402 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled 1403 // offsets. 1404 if (isNarrowStore(OpcA) || isNarrowStore(OpcB)) 1405 return false; 1406 1407 // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and 1408 // LDR<S,D,Q,W,X,SW>pre-LDR<S,D,Q,W,X,SW>ui 1409 // are candidate pairs that can be merged. 1410 if (isPreLdStPairCandidate(FirstMI, MI)) 1411 return true; 1412 1413 // Try to match an unscaled load/store with a scaled load/store. 1414 return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) && 1415 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); 1416 1417 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? 1418 } 1419 1420 static bool canRenameMOP(const MachineOperand &MOP, 1421 const TargetRegisterInfo *TRI) { 1422 if (MOP.isReg()) { 1423 auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg()); 1424 // Renaming registers with multiple disjunct sub-registers (e.g. the 1425 // result of a LD3) means that all sub-registers are renamed, potentially 1426 // impacting other instructions we did not check. Bail out. 1427 // Note that this relies on the structure of the AArch64 register file. In 1428 // particular, a subregister cannot be written without overwriting the 1429 // whole register. 1430 if (RegClass->HasDisjunctSubRegs) { 1431 LLVM_DEBUG( 1432 dbgs() 1433 << " Cannot rename operands with multiple disjunct subregisters (" 1434 << MOP << ")\n"); 1435 return false; 1436 } 1437 1438 // We cannot rename arbitrary implicit-defs, the specific rule to rewrite 1439 // them must be known. For example, in ORRWrs the implicit-def 1440 // corresponds to the result register. 1441 if (MOP.isImplicit() && MOP.isDef()) { 1442 if (!isRewritableImplicitDef(MOP.getParent()->getOpcode())) 1443 return false; 1444 return TRI->isSuperOrSubRegisterEq( 1445 MOP.getParent()->getOperand(0).getReg(), MOP.getReg()); 1446 } 1447 } 1448 return MOP.isImplicit() || 1449 (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied()); 1450 } 1451 1452 static bool 1453 canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, 1454 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1455 const TargetRegisterInfo *TRI) { 1456 if (!FirstMI.mayStore()) 1457 return false; 1458 1459 // Check if we can find an unused register which we can use to rename 1460 // the register used by the first load/store. 1461 1462 auto RegToRename = getLdStRegOp(FirstMI).getReg(); 1463 // For now, we only rename if the store operand gets killed at the store. 1464 if (!getLdStRegOp(FirstMI).isKill() && 1465 !any_of(FirstMI.operands(), 1466 [TRI, RegToRename](const MachineOperand &MOP) { 1467 return MOP.isReg() && !MOP.isDebug() && MOP.getReg() && 1468 MOP.isImplicit() && MOP.isKill() && 1469 TRI->regsOverlap(RegToRename, MOP.getReg()); 1470 })) { 1471 LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI); 1472 return false; 1473 } 1474 1475 bool FoundDef = false; 1476 1477 // For each instruction between FirstMI and the previous def for RegToRename, 1478 // we 1479 // * check if we can rename RegToRename in this instruction 1480 // * collect the registers used and required register classes for RegToRename. 1481 std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI, 1482 bool IsDef) { 1483 LLVM_DEBUG(dbgs() << "Checking " << MI); 1484 // Currently we do not try to rename across frame-setup instructions. 
1485 if (MI.getFlag(MachineInstr::FrameSetup)) { 1486 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions " 1487 << "currently\n"); 1488 return false; 1489 } 1490 1491 UsedInBetween.accumulate(MI); 1492 1493 // For a definition, check that we can rename the definition and exit the 1494 // loop. 1495 FoundDef = IsDef; 1496 1497 // For defs, check if we can rename the first def of RegToRename. 1498 if (FoundDef) { 1499 // For some pseudo instructions, we might not generate code in the end 1500 // (e.g. KILL) and we would end up without a correct def for the rename 1501 // register. 1502 // TODO: This might be overly conservative and we could handle those cases 1503 // in multiple ways: 1504 // 1. Insert an extra copy, to materialize the def. 1505 // 2. Skip pseudo-defs until we find an non-pseudo def. 1506 if (MI.isPseudo()) { 1507 LLVM_DEBUG(dbgs() << " Cannot rename pseudo/bundle instruction\n"); 1508 return false; 1509 } 1510 1511 for (auto &MOP : MI.operands()) { 1512 if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() || 1513 !TRI->regsOverlap(MOP.getReg(), RegToRename)) 1514 continue; 1515 if (!canRenameMOP(MOP, TRI)) { 1516 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); 1517 return false; 1518 } 1519 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); 1520 } 1521 return true; 1522 } else { 1523 for (auto &MOP : MI.operands()) { 1524 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() || 1525 !TRI->regsOverlap(MOP.getReg(), RegToRename)) 1526 continue; 1527 1528 if (!canRenameMOP(MOP, TRI)) { 1529 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); 1530 return false; 1531 } 1532 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); 1533 } 1534 } 1535 return true; 1536 }; 1537 1538 if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs)) 1539 return false; 1540 1541 if (!FoundDef) { 1542 LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n"); 1543 return false; 1544 } 1545 return true; 1546 } 1547 1548 // We want to merge the second load into the first by rewriting the usages of 1549 // the same reg between first (incl.) and second (excl.). We don't need to care 1550 // about any insns before FirstLoad or after SecondLoad. 1551 // 1. The second load writes new value into the same reg. 1552 // - The renaming is impossible to impact later use of the reg. 1553 // - The second load always trash the value written by the first load which 1554 // means the reg must be killed before the second load. 1555 // 2. The first load must be a def for the same reg so we don't need to look 1556 // into anything before it. 1557 static bool canRenameUntilSecondLoad( 1558 MachineInstr &FirstLoad, MachineInstr &SecondLoad, 1559 LiveRegUnits &UsedInBetween, 1560 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1561 const TargetRegisterInfo *TRI) { 1562 if (FirstLoad.isPseudo()) 1563 return false; 1564 1565 UsedInBetween.accumulate(FirstLoad); 1566 auto RegToRename = getLdStRegOp(FirstLoad).getReg(); 1567 bool Success = std::all_of( 1568 FirstLoad.getIterator(), SecondLoad.getIterator(), 1569 [&](MachineInstr &MI) { 1570 LLVM_DEBUG(dbgs() << "Checking " << MI); 1571 // Currently we do not try to rename across frame-setup instructions. 
1572 if (MI.getFlag(MachineInstr::FrameSetup)) { 1573 LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions " 1574 << "currently\n"); 1575 return false; 1576 } 1577 1578 for (auto &MOP : MI.operands()) { 1579 if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() || 1580 !TRI->regsOverlap(MOP.getReg(), RegToRename)) 1581 continue; 1582 if (!canRenameMOP(MOP, TRI)) { 1583 LLVM_DEBUG(dbgs() << " Cannot rename " << MOP << " in " << MI); 1584 return false; 1585 } 1586 RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); 1587 } 1588 1589 return true; 1590 }); 1591 return Success; 1592 } 1593 1594 // Check if we can find a physical register for renaming \p Reg. This register 1595 // must: 1596 // * not be defined already in \p DefinedInBB; DefinedInBB must contain all 1597 // defined registers up to the point where the renamed register will be used, 1598 // * not used in \p UsedInBetween; UsedInBetween must contain all accessed 1599 // registers in the range the rename register will be used, 1600 // * is available in all used register classes (checked using RequiredClasses). 1601 static std::optional<MCPhysReg> tryToFindRegisterToRename( 1602 const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB, 1603 LiveRegUnits &UsedInBetween, 1604 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, 1605 const TargetRegisterInfo *TRI) { 1606 const MachineRegisterInfo &RegInfo = MF.getRegInfo(); 1607 1608 // Checks if any sub- or super-register of PR is callee saved. 1609 auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) { 1610 return any_of(TRI->sub_and_superregs_inclusive(PR), 1611 [&MF, TRI](MCPhysReg SubOrSuper) { 1612 return TRI->isCalleeSavedPhysReg(SubOrSuper, MF); 1613 }); 1614 }; 1615 1616 // Check if PR or one of its sub- or super-registers can be used for all 1617 // required register classes. 1618 auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) { 1619 return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) { 1620 return any_of( 1621 TRI->sub_and_superregs_inclusive(PR), 1622 [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); }); 1623 }); 1624 }; 1625 1626 auto *RegClass = TRI->getMinimalPhysRegClass(Reg); 1627 for (const MCPhysReg &PR : *RegClass) { 1628 if (DefinedInBB.available(PR) && UsedInBetween.available(PR) && 1629 !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) && 1630 CanBeUsedForAllClasses(PR)) { 1631 DefinedInBB.addReg(PR); 1632 LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI) 1633 << "\n"); 1634 return {PR}; 1635 } 1636 } 1637 LLVM_DEBUG(dbgs() << "No rename register found from " 1638 << TRI->getRegClassName(RegClass) << "\n"); 1639 return std::nullopt; 1640 } 1641 1642 // For store pairs: returns a register from FirstMI to the beginning of the 1643 // block that can be renamed. 1644 // For load pairs: returns a register from FirstMI to MI that can be renamed. 
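// Returns std::nullopt if renaming is not possible, e.g. when liveness is not
// tracked, when rewriting the existing uses/defs would be unsafe, or when no
// suitable spare register can be found.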
static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
    std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
    Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  std::optional<MCPhysReg> RenameReg;
  if (!DebugCounter::shouldExecute(RegRenamingCounter))
    return RenameReg;

  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return RenameReg;

  const bool IsLoad = FirstMI.mayLoad();

  if (!MaybeCanRename) {
    if (IsLoad)
      MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
                                                 RequiredClasses, TRI)};
    else
      MaybeCanRename = {
          canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
  }

  if (*MaybeCanRename) {
    RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
                                          RequiredClasses, TRI);
  }
  return RenameReg;
}

/// Scan the instructions looking for a load/store that can be combined with
/// the current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are scaled.
        // If FirstMI is scaled then scale the offset of MI accordingly.
        // Otherwise, do the opposite (i.e., make MI's offset unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register, it can't be combined into a pre-index ld/st
        // pair. Additionally, if the base reg is used or modified, the
        // operations can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction are
          // the same, it needs to be using the updated register and therefore
          // it must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the optimization.
        // For example, in the following pattern
        //   ldr x1, [x2]
        //   ldr x2, [x3]
        //   ldr x4, [x2, #8]
        // the first and third ldr cannot be converted to ldp x1, x4, [x2].
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                Reg, getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(&MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between the
        // first and the second alias with the first, we can combine the first
        // into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update list of instructions that read/write memory.
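    // These feed the mayAlias checks that run once a pairing candidate is
    // found.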
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  auto End = MI.getParent()->end();
  if (MaybeCFI == End ||
      MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
      !(MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy)) ||
      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
    return End;

  const MachineFunction &MF = *MI.getParent()->getParent();
  unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
  const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
  switch (CFI.getOperation()) {
  case MCCFIInstruction::OpDefCfa:
  case MCCFIInstruction::OpDefCfaOffset:
    return MaybeCFI;
  default:
    return End;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);

  // If updating the SP and the following instruction is a CFA-offset-related
  // CFI instruction, move it after the merged instruction.
  MachineBasicBlock::iterator CFI =
      IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  if (CFI != E) {
    MachineBasicBlock *MBB = I->getParent();
    MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, we check that it matches the amount
    // we're adding to the register.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
                         TII->getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
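  // (MIUnscaledOffset is the memory instruction's immediate offset in bytes.)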
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // Nor does it apply to STGPi, which, unlike normal stores, has no
  // unpredictable behavior in this case and always performs the writeback
  // after reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP on windows
    // targets as it requires adjusting the unwind/debug info; messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    // If we are optimizing SP, do not allow instructions that may load or
    // store in between the load and the optimized value update.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
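  // Tag stores are exempt since they ignore the address part of the source
  // register.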
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP on windows
    // targets as it requires adjusting the unwind/debug info; messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
    // Keep track of whether we have a memory access before an SP pre-increment;
    // in that case we need to validate later that the update amount respects
    // the red zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a
    // pain, so we let the merge routine tell us what the next instruction
    // is after it's done mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
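// e.g., "strh wzr, [x0]" and "strh wzr, [x0, #2]" become "str wzr, [x0]".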
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergeable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If the disable-ldp feature is enabled, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If the disable-stp feature is enabled, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1.)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine tell
    // us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // Get the needed alignments to check them if the
    // ldp-aligned-only/stp-aligned-only features are enabled.
    uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
    uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;

    // If a load arrives and the ldp-aligned-only feature is enabled, check that
    // the alignment of the source pointer is at least double the alignment of
    // the type.
    if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
        MemAlignment < 2 * TypeAlignment)
      return false;

    // If a store arrives and the stp-aligned-only feature is enabled, check
    // that the alignment of the source pointer is at least double the alignment
    // of the type.
    if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
        MemAlignment < 2 * TypeAlignment)
      return false;

    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);

    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  //   ldr x0, [x20]
  //   add x20, x20, #32
  // merged into:
  //   ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  //   add x0, x0, #8
  //   ldr x1, [x0]
  // merged into:
  //   ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for,
  // however, is not, so adjust here.
  int UnscaledOffset =
      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  //   ldr x1, [x0, #64]
  //   add x0, x0, #64
  // merged into:
  //   ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //    e.g.,
  //      str w1, [x0, #4]
  //      ldrh w2, [x0, #6]
  //    ; becomes
  //      str w1, [x0, #4]
  //      lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //    e.g.,
  //      strh wzr, [x0]
  //      strh wzr, [x0, #2]
  //    ; becomes
  //      str wzr, [x0]
  //    e.g.,
  //      str wzr, [x0]
  //      str wzr, [x0, #4]
  //    ; becomes
  //      str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //    e.g.,
  //      ldr x0, [x2]
  //      ldr x1, [x2, #8]
  //    ; becomes
  //      ldp x0, x1, [x2]

  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //    e.g.,
  //      ldr x0, [x2]
  //      add x2, x2, #4
  //    ; becomes
  //      ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a
  // load or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn) {
    auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
    Modified |= M;
  }

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, a pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}