//===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs load / store related peephole
// optimizations. This pass should be run after register allocation.
//
// The pass runs after the PrologEpilogInserter where we emit the CFI
// instructions. In order to preserve the correctness of the unwind information,
// the pass should not change the order of any two instructions, one of which
// has the FrameSetup/FrameDestroy flag, or, alternatively, it should apply an
// ad-hoc fix to the unwind information.
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "aarch64-ldst-opt"

STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store pairs generated from unscaled instructions");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
STATISTIC(NumFailedAlignmentCheck, "Number of load/store pair transformations "
                                   "that failed the alignment check");

DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
              "Controls which pairs are considered for renaming");

// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// The UpdateLimit limits how far we search for update instructions when we
// form pre-/post-index instructions.
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                     cl::Hidden);

// Enable register renaming to find additional store pairing opportunities.
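// Schematically, renaming turns code like
//   mov w8, #13
//   str w8, [x9]       ; w8 killed here
//   mov w8, #42
//   str w8, [x9, #4]
// into a pairable sequence by rewriting the first def and store of w8 to use
// a free register (say w10), so the two stores can be merged into
//   stp w10, w8, [x9]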
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                    cl::init(true), cl::Hidden);

#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"

namespace {

using LdStPairFlags = struct LdStPairFlags {
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with a
  // pair-wise insn, and false if the reverse is true.
  bool MergeForward = false;

  // SExtIdx gives the index of the result of the load pair that must be
  // extended. The value of SExtIdx assumes that the paired load produces the
  // value in this order: (I, returned iterator), i.e., -1 means no value has
  // to be extended, 0 means I, and 1 means the returned iterator.
  int SExtIdx = -1;

  // If set, RenameReg can be used to rename the result register of the first
  // store in a pair. Currently this only works when merging stores forward.
  std::optional<MCPhysReg> RenameReg;

  LdStPairFlags() = default;

  void setMergeForward(bool V = true) { MergeForward = V; }
  bool getMergeForward() const { return MergeForward; }

  void setSExtIdx(int V) { SExtIdx = V; }
  int getSExtIdx() const { return SExtIdx; }

  void setRenameReg(MCPhysReg R) { RenameReg = R; }
  void clearRenameReg() { RenameReg = std::nullopt; }
  std::optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};

struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;

  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
    initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
  }

  AliasAnalysis *AA;
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;
  const AArch64Subtarget *Subtarget;

  // Track which register units have been modified and used.
  LiveRegUnits ModifiedRegUnits, UsedRegUnits;
  LiveRegUnits DefinedInBB;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               LdStPairFlags &Flags,
                                               unsigned Limit,
                                               bool FindNarrowMerge);

  // Scan the instructions looking for a store that writes to the address from
  // which the current load instruction reads. Return true if one is found.
  bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                         MachineBasicBlock::iterator &StoreI);

  // Merge the two narrow zero stores indicated into a single wider store
  // instruction.
  MachineBasicBlock::iterator
  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator MergeMI,
                        const LdStPairFlags &Flags);

  // Merge the two instructions indicated into a single pair-wise instruction.
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired,
                   const LdStPairFlags &Flags);

  // Promote the load that reads directly from the address stored to.
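  // For example, the load in
  //   str w1, [x0, #4]
  //   ldr w2, [x0, #4]
  // becomes a register move from w1 to w2 (or a bitfield extract when the
  // load is narrower than the store).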
  MachineBasicBlock::iterator
  promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                       MachineBasicBlock::iterator StoreI);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                int UnscaledOffset, unsigned Limit);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Find an instruction that updates the base register of the ld/st
  // instruction.
  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                            unsigned BaseReg, int Offset);

  // Merge a pre- or post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergeUpdateInsn(MachineBasicBlock::iterator I,
                  MachineBasicBlock::iterator Update, bool IsPreIdx);

  // Find and merge zero store instructions.
  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);

  // Find and pair ldr/str instructions.
  bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);

  // Find and promote load instructions which read directly from store.
  bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);

  // Find and merge base register updates before or after a ld/st instruction.
  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);

  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};

char AArch64LoadStoreOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                AARCH64_LOAD_STORE_OPT_NAME, false, false)

static bool isNarrowStore(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
    return true;
  }
}

// These instructions set the memory tag and either keep the memory contents
// unchanged or set them to zero, ignoring the address part of the source
// register.
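// E.g. STGi only updates the allocation tag of a 16-byte granule, while
// STZGi also zeroes the granule's contents.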
static bool isTagStore(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
    return true;
  }
}

static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                         bool *IsValidLdStrOpc = nullptr) {
  if (IsValidLdStrOpc)
    *IsValidLdStrOpc = true;
  switch (Opc) {
  default:
    if (IsValidLdStrOpc)
      *IsValidLdStrOpc = false;
    return std::numeric_limits<unsigned>::max();
  case AArch64::STRDui:
  case AArch64::STURDi:
  case AArch64::STRDpre:
  case AArch64::STRQui:
  case AArch64::STURQi:
  case AArch64::STRQpre:
  case AArch64::STRBBui:
  case AArch64::STURBBi:
  case AArch64::STRHHui:
  case AArch64::STURHHi:
  case AArch64::STRWui:
  case AArch64::STRWpre:
  case AArch64::STURWi:
  case AArch64::STRXui:
  case AArch64::STRXpre:
  case AArch64::STURXi:
  case AArch64::LDRDui:
  case AArch64::LDURDi:
  case AArch64::LDRDpre:
  case AArch64::LDRQui:
  case AArch64::LDURQi:
  case AArch64::LDRQpre:
  case AArch64::LDRWui:
  case AArch64::LDURWi:
  case AArch64::LDRWpre:
  case AArch64::LDRXui:
  case AArch64::LDURXi:
  case AArch64::LDRXpre:
  case AArch64::STRSui:
  case AArch64::STURSi:
  case AArch64::STRSpre:
  case AArch64::LDRSui:
  case AArch64::LDURSi:
  case AArch64::LDRSpre:
    return Opc;
  case AArch64::LDRSWui:
    return AArch64::LDRWui;
  case AArch64::LDURSWi:
    return AArch64::LDURWi;
  case AArch64::LDRSWpre:
    return AArch64::LDRWpre;
  }
}

static unsigned getMatchingWideOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no wide equivalent!");
  case AArch64::STRBBui:
    return AArch64::STRHHui;
  case AArch64::STRHHui:
    return AArch64::STRWui;
  case AArch64::STURBBi:
    return AArch64::STURHHi;
  case AArch64::STURHHi:
    return AArch64::STURWi;
  case AArch64::STURWi:
    return AArch64::STURXi;
  case AArch64::STRWui:
    return AArch64::STRXui;
  }
}

static unsigned getMatchingPairOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pairwise equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STPSi;
  case AArch64::STRSpre:
    return AArch64::STPSpre;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STPDi;
  case AArch64::STRDpre:
    return AArch64::STPDpre;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STPQi;
  case AArch64::STRQpre:
    return AArch64::STPQpre;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STPWi;
  case AArch64::STRWpre:
    return AArch64::STPWpre;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STPXi;
  case AArch64::STRXpre:
    return AArch64::STPXpre;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDPSi;
  case AArch64::LDRSpre:
    return AArch64::LDPSpre;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDPDi;
  case AArch64::LDRDpre:
    return AArch64::LDPDpre;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDPQi;
  case AArch64::LDRQpre:
    return AArch64::LDPQpre;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDPWi;
  case AArch64::LDRWpre:
    return AArch64::LDPWpre;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDPXi;
  case AArch64::LDRXpre:
    return AArch64::LDPXpre;
  case AArch64::LDRSWui:
  case AArch64::LDURSWi:
    return AArch64::LDPSWi;
  case AArch64::LDRSWpre:
    return AArch64::LDPSWpre;
  }
}

static unsigned isMatchingStore(MachineInstr &LoadInst,
                                MachineInstr &StoreInst) {
  unsigned LdOpc = LoadInst.getOpcode();
  unsigned StOpc = StoreInst.getOpcode();
  switch (LdOpc) {
  default:
    llvm_unreachable("Unsupported load instruction!");
  case AArch64::LDRBBui:
    return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
           StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURBBi:
    return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
           StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRHHui:
    return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
           StOpc == AArch64::STRXui;
  case AArch64::LDURHHi:
    return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
           StOpc == AArch64::STURXi;
  case AArch64::LDRWui:
    return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
  case AArch64::LDURWi:
    return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
  case AArch64::LDRXui:
    return StOpc == AArch64::STRXui;
  case AArch64::LDURXi:
    return StOpc == AArch64::STURXi;
  }
}

static unsigned getPreIndexedOpcode(unsigned Opc) {
  // FIXME: We don't currently support creating pre-indexed loads/stores when
  // the load or store is the unscaled version. If we decide to perform such an
  // optimization in the future the cases for the unscaled loads/stores will
  // need to be added here.
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no pre-indexed equivalent!");
  case AArch64::STRSui:
    return AArch64::STRSpre;
  case AArch64::STRDui:
    return AArch64::STRDpre;
  case AArch64::STRQui:
    return AArch64::STRQpre;
  case AArch64::STRBBui:
    return AArch64::STRBBpre;
  case AArch64::STRHHui:
    return AArch64::STRHHpre;
  case AArch64::STRWui:
    return AArch64::STRWpre;
  case AArch64::STRXui:
    return AArch64::STRXpre;
  case AArch64::LDRSui:
    return AArch64::LDRSpre;
  case AArch64::LDRDui:
    return AArch64::LDRDpre;
  case AArch64::LDRQui:
    return AArch64::LDRQpre;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpre;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpre;
  case AArch64::LDRWui:
    return AArch64::LDRWpre;
  case AArch64::LDRXui:
    return AArch64::LDRXpre;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpre;
  case AArch64::LDPSi:
    return AArch64::LDPSpre;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpre;
  case AArch64::LDPDi:
    return AArch64::LDPDpre;
  case AArch64::LDPQi:
    return AArch64::LDPQpre;
  case AArch64::LDPWi:
    return AArch64::LDPWpre;
  case AArch64::LDPXi:
    return AArch64::LDPXpre;
  case AArch64::STPSi:
    return AArch64::STPSpre;
  case AArch64::STPDi:
    return AArch64::STPDpre;
  case AArch64::STPQi:
    return AArch64::STPQpre;
  case AArch64::STPWi:
    return AArch64::STPWpre;
  case AArch64::STPXi:
    return AArch64::STPXpre;
  case AArch64::STGi:
    return AArch64::STGPreIndex;
  case AArch64::STZGi:
    return AArch64::STZGPreIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPreIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPreIndex;
  case AArch64::STGPi:
    return AArch64::STGPpre;
  }
}

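// Maps a load/store opcode to its post-indexed (writeback) form; e.g. both
// LDRXui and LDURXi map to LDRXpost ("ldr x0, [x1], #imm").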
static unsigned getPostIndexedOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Opcode has no post-indexed equivalent!");
  case AArch64::STRSui:
  case AArch64::STURSi:
    return AArch64::STRSpost;
  case AArch64::STRDui:
  case AArch64::STURDi:
    return AArch64::STRDpost;
  case AArch64::STRQui:
  case AArch64::STURQi:
    return AArch64::STRQpost;
  case AArch64::STRBBui:
    return AArch64::STRBBpost;
  case AArch64::STRHHui:
    return AArch64::STRHHpost;
  case AArch64::STRWui:
  case AArch64::STURWi:
    return AArch64::STRWpost;
  case AArch64::STRXui:
  case AArch64::STURXi:
    return AArch64::STRXpost;
  case AArch64::LDRSui:
  case AArch64::LDURSi:
    return AArch64::LDRSpost;
  case AArch64::LDRDui:
  case AArch64::LDURDi:
    return AArch64::LDRDpost;
  case AArch64::LDRQui:
  case AArch64::LDURQi:
    return AArch64::LDRQpost;
  case AArch64::LDRBBui:
    return AArch64::LDRBBpost;
  case AArch64::LDRHHui:
    return AArch64::LDRHHpost;
  case AArch64::LDRWui:
  case AArch64::LDURWi:
    return AArch64::LDRWpost;
  case AArch64::LDRXui:
  case AArch64::LDURXi:
    return AArch64::LDRXpost;
  case AArch64::LDRSWui:
    return AArch64::LDRSWpost;
  case AArch64::LDPSi:
    return AArch64::LDPSpost;
  case AArch64::LDPSWi:
    return AArch64::LDPSWpost;
  case AArch64::LDPDi:
    return AArch64::LDPDpost;
  case AArch64::LDPQi:
    return AArch64::LDPQpost;
  case AArch64::LDPWi:
    return AArch64::LDPWpost;
  case AArch64::LDPXi:
    return AArch64::LDPXpost;
  case AArch64::STPSi:
    return AArch64::STPSpost;
  case AArch64::STPDi:
    return AArch64::STPDpost;
  case AArch64::STPQi:
    return AArch64::STPQpost;
  case AArch64::STPWi:
    return AArch64::STPWpost;
  case AArch64::STPXi:
    return AArch64::STPXpost;
  case AArch64::STGi:
    return AArch64::STGPostIndex;
  case AArch64::STZGi:
    return AArch64::STZGPostIndex;
  case AArch64::ST2Gi:
    return AArch64::ST2GPostIndex;
  case AArch64::STZ2Gi:
    return AArch64::STZ2GPostIndex;
  case AArch64::STGPi:
    return AArch64::STGPpost;
  }
}

static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  switch (OpcA) {
  default:
    return false;
  case AArch64::STRSpre:
    return (OpcB == AArch64::STRSui) || (OpcB == AArch64::STURSi);
  case AArch64::STRDpre:
    return (OpcB == AArch64::STRDui) || (OpcB == AArch64::STURDi);
  case AArch64::STRQpre:
    return (OpcB == AArch64::STRQui) || (OpcB == AArch64::STURQi);
  case AArch64::STRWpre:
    return (OpcB == AArch64::STRWui) || (OpcB == AArch64::STURWi);
  case AArch64::STRXpre:
    return (OpcB == AArch64::STRXui) || (OpcB == AArch64::STURXi);
  case AArch64::LDRSpre:
    return (OpcB == AArch64::LDRSui) || (OpcB == AArch64::LDURSi);
  case AArch64::LDRDpre:
    return (OpcB == AArch64::LDRDui) || (OpcB == AArch64::LDURDi);
  case AArch64::LDRQpre:
    return (OpcB == AArch64::LDRQui) || (OpcB == AArch64::LDURQi);
  case AArch64::LDRWpre:
    return (OpcB == AArch64::LDRWui) || (OpcB == AArch64::LDURWi);
  case AArch64::LDRXpre:
    return (OpcB == AArch64::LDRXui) || (OpcB == AArch64::LDURXi);
  case AArch64::LDRSWpre:
    return (OpcB == AArch64::LDRSWui) || (OpcB == AArch64::LDURSWi);
  }
}

// Returns the scale and offset range of pre/post indexed variants of MI.
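// For example, LDPXi has a memory scale of 8 and a signed 7-bit scaled
// immediate, so its pre/post-indexed variants accept byte offsets from -512
// to 504 in multiples of 8, while an unpaired instruction such as LDRXpre is
// unscaled and accepts byte offsets from -256 to 255.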
static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                       int &MinOffset, int &MaxOffset) {
  bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
  bool IsTagStore = isTagStore(MI);
  // ST*G and all paired ldst have the same scale in pre/post-indexed variants
  // as in the "unsigned offset" variant.
  // All other pre/post indexed ldst instructions are unscaled.
  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;

  if (IsPaired) {
    MinOffset = -64;
    MaxOffset = 63;
  } else {
    MinOffset = -256;
    MaxOffset = 255;
  }
}

static MachineOperand &getLdStRegOp(MachineInstr &MI,
                                    unsigned PairedRegOp = 0) {
  assert(PairedRegOp < 2 && "Unexpected register operand idx.");
  bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
  if (IsPreLdSt)
    PairedRegOp += 1;
  unsigned Idx =
      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
  return MI.getOperand(Idx);
}

static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
                                  MachineInstr &StoreInst,
                                  const AArch64InstrInfo *TII) {
  assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
  int LoadSize = TII->getMemScale(LoadInst);
  int StoreSize = TII->getMemScale(StoreInst);
  int UnscaledStOffset =
      TII->hasUnscaledLdStOffset(StoreInst)
          ? AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm()
          : AArch64InstrInfo::getLdStOffsetOp(StoreInst).getImm() * StoreSize;
  int UnscaledLdOffset =
      TII->hasUnscaledLdStOffset(LoadInst)
          ? AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm()
          : AArch64InstrInfo::getLdStOffsetOp(LoadInst).getImm() * LoadSize;
  return (UnscaledStOffset <= UnscaledLdOffset) &&
         (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
}

static bool isPromotableZeroStoreInst(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
          isNarrowStore(Opc)) &&
         getLdStRegOp(MI).getReg() == AArch64::WZR;
}

static bool isPromotableLoadFromStore(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRXui:
  // Unscaled instructions.
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
    return true;
  }
}

static bool isMergeableLdStUpdate(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  default:
    return false;
  // Scaled instructions.
  case AArch64::STRSui:
  case AArch64::STRDui:
  case AArch64::STRQui:
  case AArch64::STRXui:
  case AArch64::STRWui:
  case AArch64::STRHHui:
  case AArch64::STRBBui:
  case AArch64::LDRSui:
  case AArch64::LDRDui:
  case AArch64::LDRQui:
  case AArch64::LDRXui:
  case AArch64::LDRWui:
  case AArch64::LDRHHui:
  case AArch64::LDRBBui:
  case AArch64::STGi:
  case AArch64::STZGi:
  case AArch64::ST2Gi:
  case AArch64::STZ2Gi:
  case AArch64::STGPi:
  // Unscaled instructions.
  case AArch64::STURSi:
  case AArch64::STURDi:
  case AArch64::STURQi:
  case AArch64::STURWi:
  case AArch64::STURXi:
  case AArch64::LDURSi:
  case AArch64::LDURDi:
  case AArch64::LDURQi:
  case AArch64::LDURWi:
  case AArch64::LDURXi:
  // Paired instructions.
  case AArch64::LDPSi:
  case AArch64::LDPSWi:
  case AArch64::LDPDi:
  case AArch64::LDPQi:
  case AArch64::LDPWi:
  case AArch64::LDPXi:
  case AArch64::STPSi:
  case AArch64::STPDi:
  case AArch64::STPQi:
  case AArch64::STPWi:
  case AArch64::STPXi:
    // Make sure this is a reg+imm (as opposed to an address reloc).
    if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
      return false;

    return true;
  }
}

static bool isRewritableImplicitDef(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64::ORRWrs:
  case AArch64::ADDWri:
    return true;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator MergeMI,
                                           const LdStPairFlags &Flags) {
  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
         "Expected promotable zero stores.");

  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == MergeMI)
    NextI = next_nodbg(NextI, E);

  unsigned Opc = I->getOpcode();
  unsigned MergeMIOpc = MergeMI->getOpcode();
  bool IsScaled = !TII->hasUnscaledLdStOffset(Opc);
  bool IsMergedMIScaled = !TII->hasUnscaledLdStOffset(MergeMIOpc);
  int OffsetStride = IsScaled ? TII->getMemScale(*I) : 1;
  int MergeMIOffsetStride = IsMergedMIScaled ? TII->getMemScale(*MergeMI) : 1;

  bool MergeForward = Flags.getMergeForward();
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*MergeMI)
                   : AArch64InstrInfo::getLdStBaseOp(*I);

  // Which register is Rt and which is Rt2 depends on the offset order.
  int64_t IOffsetInBytes =
      AArch64InstrInfo::getLdStOffsetOp(*I).getImm() * OffsetStride;
  int64_t MIOffsetInBytes =
      AArch64InstrInfo::getLdStOffsetOp(*MergeMI).getImm() *
      MergeMIOffsetStride;
  // Select the final offset based on the offset order.
  int64_t OffsetImm;
  if (IOffsetInBytes > MIOffsetInBytes)
    OffsetImm = MIOffsetInBytes;
  else
    OffsetImm = IOffsetInBytes;

  int NewOpcode = getMatchingWideOpcode(Opc);
  bool FinalIsScaled = !TII->hasUnscaledLdStOffset(NewOpcode);

  // Adjust the final offset if the result opcode is a scaled store.
  if (FinalIsScaled) {
    int NewOffsetStride = FinalIsScaled ? TII->getMemScale(NewOpcode) : 1;
    assert(((OffsetImm % NewOffsetStride) == 0) &&
           "Offset should be a multiple of the store memory scale");
    OffsetImm = OffsetImm / NewOffsetStride;
  }

  // Construct the new instruction.
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineInstrBuilder MIB;
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
            .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
            .add(BaseRegOp)
            .addImm(OffsetImm)
            .cloneMergedMemRefs({&*I, &*MergeMI})
            .setMIFlags(I->mergeFlagsWith(*MergeMI));
  (void)MIB;

  LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(MergeMI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  MergeMI->eraseFromParent();
  return NextI;
}

// Apply Fn to all instructions between MI and the beginning of the block,
// until a def for DefReg is reached. Returns true iff Fn returns true for all
// visited instructions. Stops after visiting Limit instructions.
static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
                              const TargetRegisterInfo *TRI, unsigned Limit,
                              std::function<bool(MachineInstr &, bool)> &Fn) {
  auto MBB = MI.getParent();
  for (MachineInstr &I :
       instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
    if (!Limit)
      return false;
    --Limit;

    bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
      return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
             TRI->regsOverlap(MOP.getReg(), DefReg);
    });
    if (!Fn(I, isDef))
      return false;
    if (isDef)
      break;
  }
  return true;
}

static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
                                   const TargetRegisterInfo *TRI) {

  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
    if (MOP.isReg() && MOP.isKill())
      Units.removeReg(MOP.getReg());

  for (const MachineOperand &MOP : phys_regs_and_masks(MI))
    if (MOP.isReg() && !MOP.isKill())
      Units.addReg(MOP.getReg());
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      const LdStPairFlags &Flags) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way, the merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    NextI = next_nodbg(NextI, E);

  int SExtIdx = Flags.getSExtIdx();
  unsigned Opc =
      SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
  bool IsUnscaled = TII->hasUnscaledLdStOffset(Opc);
  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;

  bool MergeForward = Flags.getMergeForward();

  std::optional<MCPhysReg> RenameReg = Flags.getRenameReg();
  if (RenameReg) {
    MCRegister RegToRename = getLdStRegOp(*I).getReg();
    DefinedInBB.addReg(*RenameReg);

    // Return the sub/super register for RenameReg, matching the size of
    // OriginalReg.
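    // E.g., if RenameReg is W10 and the constrained register class is GPR64,
    // this returns X10.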
    auto GetMatchingSubReg =
        [this, RenameReg](const TargetRegisterClass *C) -> MCPhysReg {
      for (MCPhysReg SubOrSuper :
           TRI->sub_and_superregs_inclusive(*RenameReg)) {
        if (C->contains(SubOrSuper))
          return SubOrSuper;
      }
      llvm_unreachable("Should have found matching sub or super register!");
    };

    std::function<bool(MachineInstr &, bool)> UpdateMIs =
        [this, RegToRename, GetMatchingSubReg, MergeForward](MachineInstr &MI,
                                                             bool IsDef) {
          if (IsDef) {
            bool SeenDef = false;
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(OpIdx);
              // Rename the first explicit definition and all implicit
              // definitions matching RegToRename.
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  (!MergeForward || !SeenDef ||
                   (MOP.isDef() && MOP.isImplicit())) &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else {
                  if (!isRewritableImplicitDef(MI.getOpcode()))
                    continue;
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(MOP.getReg()));
                }
                MOP.setReg(MatchingReg);
                SeenDef = true;
              }
            }
          } else {
            for (unsigned OpIdx = 0; OpIdx < MI.getNumOperands(); ++OpIdx) {
              MachineOperand &MOP = MI.getOperand(OpIdx);
              if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                  TRI->regsOverlap(MOP.getReg(), RegToRename)) {
                assert((MOP.isImplicit() ||
                        (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
                       "Need renamable operands");
                Register MatchingReg;
                if (const TargetRegisterClass *RC =
                        MI.getRegClassConstraint(OpIdx, TII, TRI))
                  MatchingReg = GetMatchingSubReg(RC);
                else
                  MatchingReg = GetMatchingSubReg(
                      TRI->getMinimalPhysRegClass(MOP.getReg()));
                assert(MatchingReg != AArch64::NoRegister &&
                       "Cannot find matching regs for renaming");
                MOP.setReg(MatchingReg);
              }
            }
          }
          LLVM_DEBUG(dbgs() << "Renamed " << MI);
          return true;
        };
    forAllMIsUntilDef(MergeForward ? *I : *std::prev(Paired), RegToRename, TRI,
                      UINT32_MAX, UpdateMIs);

#if !defined(NDEBUG)
    // For forward merging store:
    // Make sure the register used for renaming is not used between the
    // paired instructions. That would trash the content before the new
    // paired instruction.
    MCPhysReg RegToCheck = *RenameReg;
    // For backward merging load:
    // Make sure the register being renamed is not used between the
    // paired instructions. That would trash the content after the new
    // paired instruction.
    if (!MergeForward)
      RegToCheck = RegToRename;
    for (auto &MI :
         iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
             MergeForward ? std::next(I) : I,
             MergeForward ? std::next(Paired) : Paired))
      assert(all_of(MI.operands(),
                    [this, RegToCheck](const MachineOperand &MOP) {
                      return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
                             MOP.isUndef() ||
                             !TRI->regsOverlap(MOP.getReg(), RegToCheck);
                    }) &&
             "Rename register used between paired instruction, trashing the "
             "content");
#endif
  }

  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // MergeForward also determines from which instruction we copy the base
  // register operand, so that the flags stay compatible with the input code.
  const MachineOperand &BaseRegOp =
      MergeForward ? AArch64InstrInfo::getLdStBaseOp(*Paired)
                   : AArch64InstrInfo::getLdStBaseOp(*I);

  int Offset = AArch64InstrInfo::getLdStOffsetOp(*I).getImm();
  int PairedOffset = AArch64InstrInfo::getLdStOffsetOp(*Paired).getImm();
  bool PairedIsUnscaled = TII->hasUnscaledLdStOffset(Paired->getOpcode());
  if (IsUnscaled != PairedIsUnscaled) {
    // We're trying to pair instructions that differ in how they are scaled. If
    // I is scaled then scale the offset of Paired accordingly. Otherwise, do
    // the opposite (i.e., make Paired's offset unscaled).
    int MemSize = TII->getMemScale(*Paired);
    if (PairedIsUnscaled) {
      // If the unscaled offset isn't a multiple of the MemSize, we can't
      // pair the operations together.
      assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
             "Offset should be a multiple of the stride!");
      PairedOffset /= MemSize;
    } else {
      PairedOffset *= MemSize;
    }
  }

  // Which register is Rt and which is Rt2 depends on the offset order.
  // However, for pre-indexed load/stores, Rt should be the register of the
  // pre load/store.
  MachineInstr *RtMI, *Rt2MI;
  if (Offset == PairedOffset + OffsetStride &&
      !AArch64InstrInfo::isPreLdSt(*I)) {
    RtMI = &*Paired;
    Rt2MI = &*I;
    // Here we swapped the assumption made for SExtIdx.
    // I.e., we turn ldp I, Paired into ldp Paired, I.
    // Update the index accordingly.
    if (SExtIdx != -1)
      SExtIdx = (SExtIdx + 1) % 2;
  } else {
    RtMI = &*I;
    Rt2MI = &*Paired;
  }
  int OffsetImm = AArch64InstrInfo::getLdStOffsetOp(*RtMI).getImm();
  // Scale the immediate offset, if necessary.
  if (TII->hasUnscaledLdStOffset(RtMI->getOpcode())) {
    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
           "Unscaled offset cannot be scaled.");
    OffsetImm /= TII->getMemScale(*RtMI);
  }

  // Construct the new instruction.
  MachineInstrBuilder MIB;
  DebugLoc DL = I->getDebugLoc();
  MachineBasicBlock *MBB = I->getParent();
  MachineOperand RegOp0 = getLdStRegOp(*RtMI);
  MachineOperand RegOp1 = getLdStRegOp(*Rt2MI);
  MachineOperand &PairedRegOp = RtMI == &*Paired ? RegOp0 : RegOp1;
  // Kill flags may become invalid when moving stores for pairing.
  if (RegOp0.isUse()) {
    if (!MergeForward) {
      // Clear kill flags on store if moving upwards. Example:
      // STRWui kill %w0, ...
      // USE %w1
      // STRWui kill %w1  ; need to clear kill flag when moving STRWui upwards
      // We are about to move the store of w1, so its kill flag may become
      // invalid; not the case for w0.
      // Since w1 is used between the stores, the kill flag on w1 is cleared
      // after merging.
      // STPWi kill %w0, %w1, ...
      // USE %w1
      for (auto It = std::next(I); It != Paired && PairedRegOp.isKill(); ++It)
        if (It->readsRegister(PairedRegOp.getReg(), TRI))
          PairedRegOp.setIsKill(false);
    } else {
      // Clear kill flags of the first store's register. Example:
      // STRWui %w1, ...
      // USE kill %w1  ; need to clear kill flag when moving STRWui downwards
      // STRW %w0
      Register Reg = getLdStRegOp(*I).getReg();
      for (MachineInstr &MI : make_range(std::next(I), Paired))
        MI.clearRegisterKills(Reg, TRI);
    }
  }

  unsigned int MatchPairOpcode = getMatchingPairOpcode(Opc);
  MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(MatchPairOpcode));

  // Adds the pre-index operand for pre-indexed ld/st pairs.
  if (AArch64InstrInfo::isPreLdSt(*RtMI))
    MIB.addReg(BaseRegOp.getReg(), RegState::Define);

  MIB.add(RegOp0)
      .add(RegOp1)
      .add(BaseRegOp)
      .addImm(OffsetImm)
      .cloneMergedMemRefs({&*I, &*Paired})
      .setMIFlags(I->mergeFlagsWith(*Paired));

  (void)MIB;

  LLVM_DEBUG(
      dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Paired->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  if (SExtIdx != -1) {
    // Generate the sign extension for the proper result of the ldp.
    // I.e., with X1, that would be:
    // %w1 = KILL %w1, implicit-def %x1
    // %x1 = SBFMXri killed %x1, 0, 31
    MachineOperand &DstMO = MIB->getOperand(SExtIdx);
    // Right now, DstMO has the extended register, since it comes from an
    // extended opcode.
    Register DstRegX = DstMO.getReg();
    // Get the W variant of that register.
    Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32);
    // Update the result of LDP to use the W instead of the X variant.
    DstMO.setReg(DstRegW);
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
    LLVM_DEBUG(dbgs() << "\n");
    // Make the machine verifier happy by providing a definition for
    // the X register.
    // Insert this definition right after the generated LDP, i.e., before
    // InsertionPoint.
    MachineInstrBuilder MIBKill =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW)
            .addReg(DstRegW)
            .addReg(DstRegX, RegState::Define);
    MIBKill->getOperand(2).setImplicit();
    // Create the sign extension.
    MachineInstrBuilder MIBSXTW =
        BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX)
            .addReg(DstRegX)
            .addImm(0)
            .addImm(31);
    (void)MIBSXTW;
    LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
    LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
  } else {
    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (MergeForward)
    for (const MachineOperand &MOP : phys_regs_and_masks(*I))
      if (MOP.isReg() && MOP.isKill())
        DefinedInBB.addReg(MOP.getReg());

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
                                          MachineBasicBlock::iterator StoreI) {
  MachineBasicBlock::iterator NextI =
      next_nodbg(LoadI, LoadI->getParent()->end());

  int LoadSize = TII->getMemScale(*LoadI);
  int StoreSize = TII->getMemScale(*StoreI);
  Register LdRt = getLdStRegOp(*LoadI).getReg();
  const MachineOperand &StMO = getLdStRegOp(*StoreI);
  Register StRt = getLdStRegOp(*StoreI).getReg();
  bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);

  assert((IsStoreXReg ||
          TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
         "Unexpected RegClass");

  MachineInstr *BitExtMI;
  if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) {
    // If the load's destination register is the same as the register holding
    // the stored value, the load is redundant and can simply be removed.
    if (StRt == LdRt && LoadSize == 8) {
      for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                         LoadI->getIterator())) {
        if (MI.killsRegister(StRt, TRI)) {
          MI.clearRegisterKills(StRt, TRI);
          break;
        }
      }
      LLVM_DEBUG(dbgs() << "Remove load instruction:\n    ");
      LLVM_DEBUG(LoadI->print(dbgs()));
      LLVM_DEBUG(dbgs() << "\n");
      LoadI->eraseFromParent();
      return NextI;
    }
    // Replace the load with a mov if the load and store have the same size.
    BitExtMI =
        BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
            .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
            .add(StMO)
            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
            .setMIFlags(LoadI->getFlags());
  } else {
    // FIXME: Currently we disable this transformation in big-endian targets as
    // performance and correctness are verified only in little-endian.
    if (!Subtarget->isLittleEndian())
      return NextI;
    bool IsUnscaled = TII->hasUnscaledLdStOffset(*LoadI);
    assert(IsUnscaled == TII->hasUnscaledLdStOffset(*StoreI) &&
           "Unsupported ld/st match");
    assert(LoadSize <= StoreSize && "Invalid load size");
    int UnscaledLdOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(*LoadI).getImm() * LoadSize;
    int UnscaledStOffset =
        IsUnscaled
            ? AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm()
            : AArch64InstrInfo::getLdStOffsetOp(*StoreI).getImm() * StoreSize;
    int Width = LoadSize * 8;
    Register DestReg =
        IsStoreXReg ? Register(TRI->getMatchingSuperReg(
                          LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
                    : LdRt;

    assert((UnscaledLdOffset >= UnscaledStOffset &&
            (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
           "Invalid offset");

    int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
    int Imms = Immr + Width - 1;
    if (UnscaledLdOffset == UnscaledStOffset) {
      uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
                                | ((Immr) << 6)               // immr
                                | ((Imms) << 0);              // imms

      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
                  DestReg)
              .add(StMO)
              .addImm(AndMaskEncoded)
              .setMIFlags(LoadI->getFlags());
    } else {
      BitExtMI =
          BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
                  TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
                  DestReg)
              .add(StMO)
              .addImm(Immr)
              .addImm(Imms)
              .setMIFlags(LoadI->getFlags());
    }
  }

  // Clear kill flags between store and load.
  for (MachineInstr &MI : make_range(StoreI->getIterator(),
                                     BitExtMI->getIterator()))
    if (MI.killsRegister(StRt, TRI)) {
      MI.clearRegisterKills(StRt, TRI);
      break;
    }

  LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(LoadI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instructions:\n    ");
  LLVM_DEBUG(StoreI->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG((BitExtMI)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  LoadI->eraseFromParent();
  return NextI;
}

static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  // Convert the byte-offset used by unscaled into an "element" offset used
  // by the scaled pair load/store instructions.
  if (IsUnscaled) {
    // If the byte-offset isn't a multiple of the stride, there's no point
    // trying to match it.
    if (Offset % OffsetStride)
      return false;
    Offset /= OffsetStride;
  }
  return Offset <= 63 && Offset >= -64;
}

// Do alignment, specialized to power of 2 and for signed ints,
// avoiding having to do a C-style cast from uint64_t to int when
// using alignTo from include/llvm/Support/MathExtras.h.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}

static bool mayAlias(MachineInstr &MIa,
                     SmallVectorImpl<MachineInstr *> &MemInsns,
                     AliasAnalysis *AA) {
  for (MachineInstr *MIb : MemInsns) {
    if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false)) {
      LLVM_DEBUG(dbgs() << "Aliasing with: "; MIb->dump());
      return true;
    }
  }

  LLVM_DEBUG(dbgs() << "No aliases found\n");
  return false;
}

bool AArch64LoadStoreOpt::findMatchingStore(
    MachineBasicBlock::iterator I, unsigned Limit,
    MachineBasicBlock::iterator &StoreI) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr &LoadMI = *I;
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(LoadMI).getReg();

  // If the load is the first instruction in the block, there's obviously
  // not any matching store.
  if (MBBI == B)
    return false;

  // Track which register units have been modified and used between the first
  // insn and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  unsigned Count = 0;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If the load instruction reads directly from the address to which the
    // store instruction writes and the stored value is not modified, we can
    // promote the load. Since we do not handle stores with pre-/post-index,
    // it's unnecessary to check if BaseReg is modified by the store itself.
    // Also, we can't handle stores whose offset operand is not an immediate,
    // since it might be the address of a global variable.
    if (MI.mayStore() && isMatchingStore(LoadMI, MI) &&
        BaseReg == AArch64InstrInfo::getLdStBaseOp(MI).getReg() &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm() &&
        isLdOffsetInRangeOfSt(LoadMI, MI, TII) &&
        ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) {
      StoreI = MBBI;
      return true;
    }

    if (MI.isCall())
      return false;

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg))
      return false;

    // If we encounter a store aliased with the load, return early.
    if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
      return false;
  } while (MBBI != B && Count < Limit);
  return false;
}

static bool needsWinCFI(const MachineFunction *MF) {
  return MF->getTarget().getMCAsmInfo()->usesWindowsCFI() &&
         MF->getFunction().needsUnwindTableEntry();
}

// Returns true if FirstMI and MI are candidates for merging or pairing.
// Otherwise, returns false.
static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
                                       LdStPairFlags &Flags,
                                       const AArch64InstrInfo *TII) {
  // If this is volatile or if pairing is suppressed, not a candidate.
  if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
    return false;

  // We should have already checked FirstMI for pair suppression and volatility.
  assert(!FirstMI.hasOrderedMemoryRef() &&
         !TII->isLdStPairSuppressed(FirstMI) &&
         "FirstMI shouldn't get here if either of these checks are true.");

  if (needsWinCFI(MI.getMF()) && (MI.getFlag(MachineInstr::FrameSetup) ||
                                  MI.getFlag(MachineInstr::FrameDestroy)))
    return false;

  unsigned OpcA = FirstMI.getOpcode();
  unsigned OpcB = MI.getOpcode();

  // Opcodes match: if the opcodes are pre ld/st, there is nothing more to
  // check.
  if (OpcA == OpcB)
    return !AArch64InstrInfo::isPreLdSt(FirstMI);

  // Two pre ld/st of different opcodes cannot be merged either.
  if (AArch64InstrInfo::isPreLdSt(FirstMI) && AArch64InstrInfo::isPreLdSt(MI))
    return false;

  // Try to match a sign-extended load/store with a zero-extended load/store.
  bool IsValidLdStrOpc, PairIsValidLdStrOpc;
  unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc);
  assert(IsValidLdStrOpc &&
         "Given Opc should be a Load or Store with an immediate");
  // OpcA will be the first instruction in the pair.
  if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) {
    Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0);
    return true;
  }

  // If the second instruction isn't even a mergeable/pairable load/store, bail
  // out.
  if (!PairIsValidLdStrOpc)
    return false;

  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
  // offsets.
  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
    return false;

  // The STR<S,D,Q,W,X>pre - STR<S,D,Q,W,X>ui and
  // LDR<S,D,Q,W,X,SW>pre - LDR<S,D,Q,W,X,SW>ui
  // are candidate pairs that can be merged.
  if (isPreLdStPairCandidate(FirstMI, MI))
    return true;

  // Try to match an unscaled load/store with a scaled load/store.
  return TII->hasUnscaledLdStOffset(OpcA) != TII->hasUnscaledLdStOffset(OpcB) &&
         getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB);

  // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}

static bool canRenameMOP(const MachineOperand &MOP,
                         const TargetRegisterInfo *TRI) {
  if (MOP.isReg()) {
    auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
    // Renaming registers with multiple disjunct sub-registers (e.g. the
    // result of a LD3) means that all sub-registers are renamed, potentially
    // impacting other instructions we did not check. Bail out.
    // Note that this relies on the structure of the AArch64 register file. In
    // particular, a subregister cannot be written without overwriting the
    // whole register.
    if (RegClass->HasDisjunctSubRegs) {
      LLVM_DEBUG(
          dbgs()
          << "  Cannot rename operands with multiple disjunct subregisters ("
          << MOP << ")\n");
      return false;
    }

    // We cannot rename arbitrary implicit-defs, the specific rule to rewrite
    // them must be known. For example, in ORRWrs the implicit-def
    // corresponds to the result register.
    if (MOP.isImplicit() && MOP.isDef()) {
      if (!isRewritableImplicitDef(MOP.getParent()->getOpcode()))
        return false;
      return TRI->isSuperOrSubRegisterEq(
          MOP.getParent()->getOperand(0).getReg(), MOP.getReg());
    }
  }
  return MOP.isImplicit() ||
         (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
}

static bool
canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
                 SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
                 const TargetRegisterInfo *TRI) {
  if (!FirstMI.mayStore())
    return false;

  // Check if we can find an unused register which we can use to rename
  // the register used by the first load/store.

  auto RegToRename = getLdStRegOp(FirstMI).getReg();
  // For now, we only rename if the store operand gets killed at the store.
  if (!getLdStRegOp(FirstMI).isKill() &&
      !any_of(FirstMI.operands(),
              [TRI, RegToRename](const MachineOperand &MOP) {
                return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
                       MOP.isImplicit() && MOP.isKill() &&
                       TRI->regsOverlap(RegToRename, MOP.getReg());
              })) {
    LLVM_DEBUG(dbgs() << "  Operand not killed at " << FirstMI);
    return false;
  }

  bool FoundDef = false;

  // For each instruction between FirstMI and the previous def for RegToRename,
  // we
  // * check if we can rename RegToRename in this instruction
  // * collect the registers used and required register classes for
  //   RegToRename.
  std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
                                                           bool IsDef) {
    LLVM_DEBUG(dbgs() << "Checking " << MI);
    // Currently we do not try to rename across frame-setup instructions.
    if (MI.getFlag(MachineInstr::FrameSetup)) {
      LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                        << "currently\n");
      return false;
    }

    UsedInBetween.accumulate(MI);

    // For a definition, check that we can rename the definition and exit the
    // loop.
    FoundDef = IsDef;

    // For defs, check if we can rename the first def of RegToRename.
    if (FoundDef) {
      // For some pseudo instructions, we might not generate code in the end
      // (e.g. KILL) and we would end up without a correct def for the rename
      // register.
      // TODO: This might be overly conservative and we could handle those cases
      // in multiple ways:
      //   1. Insert an extra copy, to materialize the def.
      //   2. Skip pseudo-defs until we find a non-pseudo def.
      if (MI.isPseudo()) {
        LLVM_DEBUG(dbgs() << "  Cannot rename pseudo/bundle instruction\n");
        return false;
      }

      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;
        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
      return true;
    } else {
      for (auto &MOP : MI.operands()) {
        if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
            !TRI->regsOverlap(MOP.getReg(), RegToRename))
          continue;

        if (!canRenameMOP(MOP, TRI)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
          return false;
        }
        RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
      }
    }
    return true;
  };

  if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
    return false;

  if (!FoundDef) {
    LLVM_DEBUG(dbgs() << "  Did not find definition for register in BB\n");
    return false;
  }
  return true;
}

// We want to merge the second load into the first by rewriting the usages of
// the same reg between first (incl.) and second (excl.). We don't need to care
// about any insns before FirstLoad or after SecondLoad.
// 1. The second load writes a new value into the same reg:
//    - The renaming cannot impact later uses of the reg.
//    - The second load always trashes the value written by the first load,
//      which means the reg must be killed before the second load.
// 2. The first load must be a def of the same reg, so we don't need to look
//    at anything before it.
static bool canRenameUntilSecondLoad(
    MachineInstr &FirstLoad, MachineInstr &SecondLoad,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  if (FirstLoad.isPseudo())
    return false;

  UsedInBetween.accumulate(FirstLoad);
  auto RegToRename = getLdStRegOp(FirstLoad).getReg();
  bool Success = std::all_of(
      FirstLoad.getIterator(), SecondLoad.getIterator(),
      [&](MachineInstr &MI) {
        LLVM_DEBUG(dbgs() << "Checking " << MI);
        // Currently we do not try to rename across frame-setup instructions.
        if (MI.getFlag(MachineInstr::FrameSetup)) {
          LLVM_DEBUG(dbgs() << "  Cannot rename framesetup instructions "
                            << "currently\n");
          return false;
        }

        for (auto &MOP : MI.operands()) {
          if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
              !TRI->regsOverlap(MOP.getReg(), RegToRename))
            continue;
          if (!canRenameMOP(MOP, TRI)) {
            LLVM_DEBUG(dbgs() << "  Cannot rename " << MOP << " in " << MI);
            return false;
          }
          RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
        }

        return true;
      });
  return Success;
}

// Check if we can find a physical register for renaming \p Reg. This register
// must:
// * not be defined already in \p DefinedInBB; DefinedInBB must contain all
//   defined registers up to the point where the renamed register will be used,
// * not be used in \p UsedInBetween; UsedInBetween must contain all accessed
//   registers in the range in which the rename register will be used,
// * be available in all required register classes (checked using
//   RequiredClasses).
static std::optional<MCPhysReg> tryToFindRegisterToRename(
    const MachineFunction &MF, Register Reg, LiveRegUnits &DefinedInBB,
    LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  const MachineRegisterInfo &RegInfo = MF.getRegInfo();

  // Checks if any sub- or super-register of PR is callee saved.
  auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
    return any_of(TRI->sub_and_superregs_inclusive(PR),
                  [&MF, TRI](MCPhysReg SubOrSuper) {
                    return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
                  });
  };

  // Check if PR or one of its sub- or super-registers can be used for all
  // required register classes.
  auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
    return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
      return any_of(
          TRI->sub_and_superregs_inclusive(PR),
          [C](MCPhysReg SubOrSuper) { return C->contains(SubOrSuper); });
    });
  };

  auto *RegClass = TRI->getMinimalPhysRegClass(Reg);
  for (const MCPhysReg &PR : *RegClass) {
    if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
        !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
        CanBeUsedForAllClasses(PR)) {
      DefinedInBB.addReg(PR);
      LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
                        << "\n");
      return {PR};
    }
  }
  LLVM_DEBUG(dbgs() << "No rename register found from "
                    << TRI->getRegClassName(RegClass) << "\n");
  return std::nullopt;
}

// For store pairs: returns a register from FirstMI to the beginning of the
// block that can be renamed.
// For load pairs: returns a register from FirstMI to MI that can be renamed.
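//
// For illustration only (a sketch with arbitrary register choices, not taken
// from actual compiler output): renaming is what allows a pair to be formed
// when both loads target the same register. A sequence such as
//   ldr x2, [x1]
//   add x3, x2, #1
//   ldr x2, [x1, #8]
// can, after renaming the first definition of x2 and its uses to a free
// register (say, x12), be merged into roughly
//   ldp x12, x2, [x1]
//   add x3, x12, #1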
static std::optional<MCPhysReg> findRenameRegForSameLdStRegPair(
    std::optional<bool> MaybeCanRename, MachineInstr &FirstMI, MachineInstr &MI,
    Register Reg, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
    SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
    const TargetRegisterInfo *TRI) {
  std::optional<MCPhysReg> RenameReg;
  if (!DebugCounter::shouldExecute(RegRenamingCounter))
    return RenameReg;

  auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
  MachineFunction &MF = *FirstMI.getParent()->getParent();
  if (!RegClass || !MF.getRegInfo().tracksLiveness())
    return RenameReg;

  const bool IsLoad = FirstMI.mayLoad();

  if (!MaybeCanRename) {
    if (IsLoad)
      MaybeCanRename = {canRenameUntilSecondLoad(FirstMI, MI, UsedInBetween,
                                                 RequiredClasses, TRI)};
    else
      MaybeCanRename = {
          canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
  }

  if (*MaybeCanRename) {
    RenameReg = tryToFindRegisterToRename(MF, Reg, DefinedInBB, UsedInBetween,
                                          RequiredClasses, TRI);
  }
  return RenameReg;
}

/// Scan the instructions looking for a load/store that can be combined with
/// the current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      LdStPairFlags &Flags, unsigned Limit,
                                      bool FindNarrowMerge) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineBasicBlock::iterator MBBIWithRenameReg;
  MachineInstr &FirstMI = *I;
  MBBI = next_nodbg(MBBI, E);

  bool MayLoad = FirstMI.mayLoad();
  bool IsUnscaled = TII->hasUnscaledLdStOffset(FirstMI);
  Register Reg = getLdStRegOp(FirstMI).getReg();
  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(FirstMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(FirstMI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
  bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);

  std::optional<bool> MaybeCanRename;
  if (!EnableRenaming)
    MaybeCanRename = {false};

  SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
  LiveRegUnits UsedInBetween;
  UsedInBetween.init(*TRI);

  Flags.clearRenameReg();

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();

  // Remember any instructions that read/write memory between FirstMI and MI.
  SmallVector<MachineInstr *, 4> MemInsns;

  LLVM_DEBUG(dbgs() << "Find match for: "; FirstMI.dump());
  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;
    LLVM_DEBUG(dbgs() << "Analysing 2nd insn: "; MI.dump());

    UsedInBetween.accumulate(MI);

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    Flags.setSExtIdx(-1);
    if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
        AArch64InstrInfo::getLdStOffsetOp(MI).isImm()) {
      assert(MI.mayLoadOrStore() && "Expected memory operation.");
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      Register MIBaseReg = AArch64InstrInfo::getLdStBaseOp(MI).getReg();
      int MIOffset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
      bool MIIsUnscaled = TII->hasUnscaledLdStOffset(MI);
      if (IsUnscaled != MIIsUnscaled) {
        // We're trying to pair instructions that differ in how they are
        // scaled. If FirstMI is scaled then scale the offset of MI
        // accordingly. Otherwise, do the opposite (i.e., make MI's offset
        // unscaled).
        int MemSize = TII->getMemScale(MI);
        if (MIIsUnscaled) {
          // If the unscaled offset isn't a multiple of the MemSize, we can't
          // pair the operations together: bail and keep looking.
          if (MIOffset % MemSize) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
          MIOffset /= MemSize;
        } else {
          MIOffset *= MemSize;
        }
      }

      bool IsPreLdSt = isPreLdStPairCandidate(FirstMI, MI);

      if (BaseReg == MIBaseReg) {
        // If the offset of the second ld/st is not equal to the size of the
        // destination register, it can't be paired with a pre-index ld/st
        // pair. Additionally, if the base reg is used or modified, the
        // operations can't be paired: bail and keep looking.
        if (IsPreLdSt) {
          bool IsOutOfBounds = MIOffset != TII->getMemScale(MI);
          bool IsBaseRegUsed = !UsedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          bool IsBaseRegModified = !ModifiedRegUnits.available(
              AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          // If the stored value and the address of the second instruction are
          // the same, it needs to be using the updated register and therefore
          // the update must not be folded.
          bool IsMIRegTheSame =
              TRI->regsOverlap(getLdStRegOp(MI).getReg(),
                               AArch64InstrInfo::getLdStBaseOp(MI).getReg());
          if (IsOutOfBounds || IsBaseRegUsed || IsBaseRegModified ||
              IsMIRegTheSame) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          if ((Offset != MIOffset + OffsetStride) &&
              (Offset + OffsetStride != MIOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        }

        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        if (FindNarrowMerge) {
          // If the alignment requirements of the scaled wide load/store
          // instruction can't express the offset of the scaled narrow input,
          // bail and keep looking. For promotable zero stores, allow only when
          // the stored value is the same (i.e., WZR).
          if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) ||
              (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            continue;
          }
        } else {
          // Pairwise instructions have a 7-bit signed offset field. Single
          // insns have a 12-bit unsigned offset field. If the resultant
          // immediate offset of merging these instructions is out of range for
          // a pairwise instruction, bail and keep looking.
          if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs() << "Offset doesn't fit in immediate, "
                              << "keep looking.\n");
            continue;
          }
          // If the alignment requirements of the paired (scaled) instruction
          // can't express the offset of the unscaled input, bail and keep
          // looking.
          if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) {
            LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                              UsedRegUnits, TRI);
            MemInsns.push_back(&MI);
            LLVM_DEBUG(dbgs()
                       << "Offset doesn't fit due to alignment requirements, "
                       << "keep looking.\n");
            continue;
          }
        }

        // If the BaseReg has been modified, then we cannot do the
        // optimization. For example, in the following pattern
        //   ldr x1, [x2]
        //   ldr x2, [x3]
        //   ldr x4, [x2, #8]
        // the first and third ldr cannot be converted to ldp x1, x4, [x2].
        if (!ModifiedRegUnits.available(BaseReg))
          return E;

        const bool SameLoadReg = MayLoad && TRI->isSuperOrSubRegisterEq(
                                                Reg, getLdStRegOp(MI).getReg());

        // If the Rt of the second instruction (destination register of the
        // load) was not modified or used between the two instructions and none
        // of the instructions between the second and first alias with the
        // second, we can combine the second into the first.
        bool RtNotModified =
            ModifiedRegUnits.available(getLdStRegOp(MI).getReg());
        bool RtNotUsed = !(MI.mayLoad() && !SameLoadReg &&
                           !UsedRegUnits.available(getLdStRegOp(MI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 2nd into 1st insn:\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n"
                          << "Reg '" << getLdStRegOp(MI) << "' not used: "
                          << (RtNotUsed ? "true" : "false") << "\n");

        if (RtNotModified && RtNotUsed && !mayAlias(MI, MemInsns, AA)) {
          // For pairs loading into the same reg, try to find a renaming
          // opportunity to allow the renaming of Reg between FirstMI and MI
          // and combine MI into FirstMI; otherwise bail and keep looking.
          if (SameLoadReg) {
            std::optional<MCPhysReg> RenameReg =
                findRenameRegForSameLdStRegPair(MaybeCanRename, FirstMI, MI,
                                                Reg, DefinedInBB, UsedInBetween,
                                                RequiredClasses, TRI);
            if (!RenameReg) {
              LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits,
                                                UsedRegUnits, TRI);
              MemInsns.push_back(&MI);
              LLVM_DEBUG(dbgs() << "Can't find reg for renaming, "
                                << "keep looking.\n");
              continue;
            }
            Flags.setRenameReg(*RenameReg);
          }

          Flags.setMergeForward(false);
          if (!SameLoadReg)
            Flags.clearRenameReg();
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions and none of the instructions between
        // the first and the second alias with the first, we can combine the
        // first into the second.
        RtNotModified = !(
            MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg()));

        LLVM_DEBUG(dbgs() << "Checking, can combine 1st into 2nd insn:\n"
                          << "Reg '" << getLdStRegOp(FirstMI)
                          << "' not modified: "
                          << (RtNotModified ? "true" : "false") << "\n");

        if (RtNotModified && !mayAlias(FirstMI, MemInsns, AA)) {
          if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
            Flags.setMergeForward(true);
            Flags.clearRenameReg();
            return MBBI;
          }

          std::optional<MCPhysReg> RenameReg = findRenameRegForSameLdStRegPair(
              MaybeCanRename, FirstMI, MI, Reg, DefinedInBB, UsedInBetween,
              RequiredClasses, TRI);
          if (RenameReg) {
            Flags.setMergeForward(true);
            Flags.setRenameReg(*RenameReg);
            MBBIWithRenameReg = MBBI;
          }
        }
        LLVM_DEBUG(dbgs() << "Unable to combine these instructions due to "
                          << "interference in between, keep looking.\n");
      }
    }

    if (Flags.getRenameReg())
      return MBBIWithRenameReg;

    // The instruction wasn't a matching load or store. Stop searching if we
    // encounter a call instruction that might modify memory.
    if (MI.isCall()) {
      LLVM_DEBUG(dbgs() << "Found a call, stop looking.\n");
      return E;
    }

    // Update modified / used register units.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (!ModifiedRegUnits.available(BaseReg)) {
      LLVM_DEBUG(dbgs() << "Base reg is modified, stop looking.\n");
      return E;
    }

    // Update the list of instructions that read/write memory.
    if (MI.mayLoadOrStore())
      MemInsns.push_back(&MI);
  }
  return E;
}

static MachineBasicBlock::iterator
maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
  auto End = MI.getParent()->end();
  if (MaybeCFI == End ||
      MaybeCFI->getOpcode() != TargetOpcode::CFI_INSTRUCTION ||
      !(MI.getFlag(MachineInstr::FrameSetup) ||
        MI.getFlag(MachineInstr::FrameDestroy)) ||
      AArch64InstrInfo::getLdStBaseOp(MI).getReg() != AArch64::SP)
    return End;

  const MachineFunction &MF = *MI.getParent()->getParent();
  unsigned CFIIndex = MaybeCFI->getOperand(0).getCFIIndex();
  const MCCFIInstruction &CFI = MF.getFrameInstructions()[CFIIndex];
  switch (CFI.getOperation()) {
  case MCCFIInstruction::OpDefCfa:
  case MCCFIInstruction::OpDefCfaOffset:
    return MaybeCFI;
  default:
    return End;
  }
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
                                     MachineBasicBlock::iterator Update,
                                     bool IsPreIdx) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator NextI = next_nodbg(I, E);

  // If we are updating the SP and the following instruction is a CFA-offset
  // related CFI instruction, move it after the merged instruction.
  MachineBasicBlock::iterator CFI =
      IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;

  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (NextI == Update)
    NextI = next_nodbg(NextI, E);

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-/post-indexed load / store");
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                             : getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB;
  int Scale, MinOffset, MaxOffset;
  getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
    // Non-paired instruction.
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  } else {
    // Paired instruction.
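    // Illustrative example (registers and immediate chosen arbitrarily, not
    // from actual output): for a paired post-index merge, something like
    //   ldp x0, x1, [x2]
    //   add x2, x2, #16
    // becomes
    //   ldp x0, x1, [x2], #16
    // with the writeback register prepended as the first operand below.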
    MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
              .add(getLdStRegOp(*Update))
              .add(getLdStRegOp(*I, 0))
              .add(getLdStRegOp(*I, 1))
              .add(AArch64InstrInfo::getLdStBaseOp(*I))
              .addImm(Value / Scale)
              .setMemRefs(I->memoperands())
              .setMIFlags(I->mergeFlagsWith(*Update));
  }
  if (CFI != E) {
    MachineBasicBlock *MBB = I->getParent();
    MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
  }

  if (IsPreIdx) {
    ++NumPreFolded;
    LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store.");
  } else {
    ++NumPostFolded;
    LLVM_DEBUG(dbgs() << "Creating post-indexed load/store.");
  }
  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
  LLVM_DEBUG(I->print(dbgs()));
  LLVM_DEBUG(dbgs() << "    ");
  LLVM_DEBUG(Update->print(dbgs()));
  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
  LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  LLVM_DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}

bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                               MachineInstr &MI,
                                               unsigned BaseReg, int Offset) {
  switch (MI.getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI.getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm()))
      break;

    // The update instruction source and destination register must be the
    // same as the load/store base register.
    if (MI.getOperand(0).getReg() != BaseReg ||
        MI.getOperand(1).getReg() != BaseReg)
      break;

    int UpdateOffset = MI.getOperand(2).getImm();
    if (MI.getOpcode() == AArch64::SUBXri)
      UpdateOffset = -UpdateOffset;

    // The immediate must be a multiple of the scaling factor of the pre/post
    // indexed instruction.
    int Scale, MinOffset, MaxOffset;
    getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset);
    if (UpdateOffset % Scale != 0)
      break;

    // Scaled offset must fit in the instruction immediate.
    int ScaledOffset = UpdateOffset / Scale;
    if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset)
      break;

    // If we have a non-zero Offset, check that it matches the amount we're
    // adding to the register; a zero Offset means the caller accepts any
    // update amount.
    if (!Offset || Offset == UpdateOffset)
      return true;
    break;
  }
  return false;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
    MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int MIUnscaledOffset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm() *
                         TII->getMemScale(MemMI);

  // Scan forward looking for post-index opportunities. Updating instructions
  // can't be formed if the memory instruction doesn't have the offset we're
  // looking for.
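  // (A note on the callers, for illustration: the post-index case passes
  // UnscaledOffset == 0, so only a memory instruction with a zero offset can
  // fold a following update, while the pre-index-forward case passes the
  // instruction's own unscaled offset, which the update amount must match.)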
  if (MIUnscaledOffset != UnscaledOffset)
    return E;

  // If the base register overlaps a source/destination register, we can't
  // merge the update. This does not apply to tag store instructions, which
  // ignore the address part of the source register.
  // Nor does it apply to STGPi, which, unlike normal stores, does not have
  // unpredictable behavior in this case and always performs writeback after
  // reading the source register value.
  if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  MBBI = next_nodbg(MBBI, E);

  // We can't post-increment the stack pointer if any instruction between
  // the memory access (I) and the increment (MBBI) can access the memory
  // region defined by [SP, MBBI].
  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP on Windows
    // targets as it would require adjusting the unwind/debug info; messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  for (unsigned Count = 0; MBBI != E && Count < Limit;
       MBBI = next_nodbg(MBBI, E)) {
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
      return MBBI;

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    // If we are optimizing SP, do not allow instructions that may load or
    // store in between the load and the optimized value update.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg) ||
        (BaseRegSP && MBBI->mayLoadOrStore()))
      return E;
  }
  return E;
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
    MachineBasicBlock::iterator I, unsigned Limit) {
  MachineBasicBlock::iterator B = I->getParent()->begin();
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineInstr &MemMI = *I;
  MachineBasicBlock::iterator MBBI = I;
  MachineFunction &MF = *MemMI.getMF();

  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();

  // If the load/store is the first instruction in the block, there's obviously
  // not any matching update. Ditto if the memory offset isn't zero.
  if (MBBI == B || Offset != 0)
    return E;
  // If the base register overlaps a destination register, we can't
  // merge the update.
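  // For example (illustrative), we must not rewrite
  //   add x0, x0, #8
  //   ldr x0, [x0]
  // into "ldr x0, [x0, #8]!", since a load with writeback whose destination
  // register equals the base register has unpredictable behavior.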
  if (!isTagStore(MemMI)) {
    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
    for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
      Register DestReg = getLdStRegOp(MemMI, i).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return E;
    }
  }

  const bool BaseRegSP = BaseReg == AArch64::SP;
  if (BaseRegSP && needsWinCFI(I->getMF())) {
    // FIXME: For now, we always block the optimization over SP on Windows
    // targets as it would require adjusting the unwind/debug info; messing up
    // the unwind info can actually cause a miscompile.
    return E;
  }

  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
  unsigned RedZoneSize =
      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());

  // Track which register units have been modified and used between the first
  // insn (inclusive) and the second insn.
  ModifiedRegUnits.clear();
  UsedRegUnits.clear();
  unsigned Count = 0;
  bool MemAccessBeforeSPPreInc = false;
  do {
    MBBI = prev_nodbg(MBBI, B);
    MachineInstr &MI = *MBBI;

    // Don't count transient instructions towards the search limit since there
    // may be different numbers of them if e.g. debug information is present.
    if (!MI.isTransient())
      ++Count;

    // If we found a match, return it.
    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) {
      // Check that the update value is within our red zone limit (which may be
      // zero).
      if (MemAccessBeforeSPPreInc && MBBI->getOperand(2).getImm() > RedZoneSize)
        return E;
      return MBBI;
    }

    // Update the status of what the instruction clobbered and used.
    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);

    // Otherwise, if the base register is used or modified, we have no match,
    // so return early.
    if (!ModifiedRegUnits.available(BaseReg) ||
        !UsedRegUnits.available(BaseReg))
      return E;
    // Keep track of whether we have a memory access before an SP
    // pre-increment; in this case we need to validate later that the update
    // amount respects the red zone.
    if (BaseRegSP && MBBI->mayLoadOrStore())
      MemAccessBeforeSPPreInc = true;
  } while (MBBI != B && Count < Limit);
  return E;
}

bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load, don't mess with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  if (needsWinCFI(MI.getMF()) && MI.getFlag(MachineInstr::FrameDestroy))
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!AArch64InstrInfo::getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a pain, so we let the
    // merge routine tell us what the next instruction is after it's done
    // mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergeable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}

// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // If the disable-ldp feature is set, do not emit ldp.
  if (MI.mayLoad() && Subtarget->hasDisableLdp())
    return false;

  // If the disable-stp feature is set, do not emit stp.
  if (MI.mayStore() && Subtarget->hasDisableStp())
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1.)
  bool IsUnscaled = TII->hasUnscaledLdStOffset(MI);
  int Offset = AArch64InstrInfo::getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    auto Prev = std::prev(MBBI);

    // Fetch the memoperand of the load/store that is a candidate for
    // combination.
    MachineMemOperand *MemOp =
        MI.memoperands_empty() ? nullptr : MI.memoperands().front();

    // If a load/store arrives and the ldp/stp-aligned-only feature is set,
    // check that the alignment of the source pointer is at least double the
    // alignment of the type.
    if ((MI.mayLoad() && Subtarget->hasLdpAlignedOnly()) ||
        (MI.mayStore() && Subtarget->hasStpAlignedOnly())) {
      // If there is no size/align information, cancel the transformation.
      if (!MemOp || !MemOp->getMemoryType().isValid()) {
        NumFailedAlignmentCheck++;
        return false;
      }

      // Get the needed alignments to check them if the
      // ldp-aligned-only/stp-aligned-only features are set.
      uint64_t MemAlignment = MemOp->getAlign().value();
      uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();

      if (MemAlignment < 2 * TypeAlignment) {
        NumFailedAlignmentCheck++;
        return false;
      }
    }

    ++NumPairCreated;
    if (TII->hasUnscaledLdStOffset(MI))
      ++NumUnscaledPairCreated;

    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    // Collect liveness info for instructions between Prev and the new position
    // MBBI.
    for (auto I = std::next(Prev); I != MBBI; I++)
      updateDefinedRegisters(*I, DefinedInBB, TRI);

    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  //   ldr x0, [x20]
  //   add x20, x20, #32
  // merged into:
  //   ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  //   add x0, x0, #8
  //   ldr x1, [x0]
  // merged into:
  //   ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for, however, is not,
  // so adjust here.
  int UnscaledOffset =
      AArch64InstrInfo::getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  //   ldr x1, [x0, #64]
  //   add x0, x0, #64
  // merged into:
  //   ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}

bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //    e.g.,
  //      str w1, [x0, #4]
  //      ldrh w2, [x0, #6]
  //    ; becomes
  //      str w1, [x0, #4]
  //      lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //    e.g.,
  //      strh wzr, [x0]
  //      strh wzr, [x0, #2]
  //    ; becomes
  //      str wzr, [x0]
  //    e.g.,
  //      str wzr, [x0]
  //      str wzr, [x0, #4]
  //    ; becomes
  //      str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //    e.g.,
  //      ldr x0, [x2]
  //      ldr x1, [x2, #8]
  //    ; becomes
  //      ldp x0, x1, [x2]

  if (MBB.getParent()->getRegInfo().tracksLiveness()) {
    DefinedInBB.clear();
    DefinedInBB.addLiveIns(MBB);
  }

  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    // Track currently live registers up to this point, to help with
    // searching for a rename register on demand.
    updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //    e.g.,
  //      ldr x0, [x2]
  //      add x2, x2, #4
  //    ; becomes
  //      ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &Fn.getSubtarget<AArch64Subtarget>();
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once
  // per function and then clear the register units each time we optimize a
  // load or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);
  DefinedInBB.init(*TRI);

  bool Modified = false;
  bool EnableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, EnableNarrowZeroStOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads
// and stores near one another? Note: The pre-RA instruction scheduler already
// has hooks to try and schedule pairable loads/stores together to improve
// pairing opportunities. Thus, a pre-RA pairing pass may not be worth the
// effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}