1 //===- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains a pass that performs load / store related peephole 10 // optimizations. This pass should be run after register allocation. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AArch64InstrInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "llvm/ADT/BitVector.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/ADT/iterator_range.h" 22 #include "llvm/Analysis/AliasAnalysis.h" 23 #include "llvm/CodeGen/MachineBasicBlock.h" 24 #include "llvm/CodeGen/MachineFunction.h" 25 #include "llvm/CodeGen/MachineFunctionPass.h" 26 #include "llvm/CodeGen/MachineInstr.h" 27 #include "llvm/CodeGen/MachineInstrBuilder.h" 28 #include "llvm/CodeGen/MachineOperand.h" 29 #include "llvm/CodeGen/TargetRegisterInfo.h" 30 #include "llvm/IR/DebugLoc.h" 31 #include "llvm/MC/MCRegisterInfo.h" 32 #include "llvm/Pass.h" 33 #include "llvm/Support/CommandLine.h" 34 #include "llvm/Support/Debug.h" 35 #include "llvm/Support/ErrorHandling.h" 36 #include "llvm/Support/raw_ostream.h" 37 #include <cassert> 38 #include <cstdint> 39 #include <iterator> 40 #include <limits> 41 42 using namespace llvm; 43 44 #define DEBUG_TYPE "aarch64-ldst-opt" 45 46 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); 47 STATISTIC(NumPostFolded, "Number of post-index updates folded"); 48 STATISTIC(NumPreFolded, "Number of pre-index updates folded"); 49 STATISTIC(NumUnscaledPairCreated, 50 "Number of load/store from unscaled generated"); 51 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); 52 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); 53 54 // The LdStLimit limits how far we search for load/store pairs. 55 static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", 56 cl::init(20), cl::Hidden); 57 58 // The UpdateLimit limits how far we search for update instructions when we form 59 // pre-/post-index instructions. 60 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), 61 cl::Hidden); 62 63 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" 64 65 namespace { 66 67 using LdStPairFlags = struct LdStPairFlags { 68 // If a matching instruction is found, MergeForward is set to true if the 69 // merge is to remove the first instruction and replace the second with 70 // a pair-wise insn, and false if the reverse is true. 71 bool MergeForward = false; 72 73 // SExtIdx gives the index of the result of the load pair that must be 74 // extended. The value of SExtIdx assumes that the paired load produces the 75 // value in this order: (I, returned iterator), i.e., -1 means no value has 76 // to be extended, 0 means I, and 1 means the returned iterator. 
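  // For example (using arbitrary registers), when pairing
  //   ldrsw x0, [x2]
  //   ldr   w1, [x2, #4]
  // the two opcodes map to the same non-sign-extending opcode (LDRWui), so the
  // candidates are merged into an LDPWi and SExtIdx is set to 0: the value
  // loaded for I must still be sign-extended (via SBFMXri) after the merge.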
77 int SExtIdx = -1; 78 79 LdStPairFlags() = default; 80 81 void setMergeForward(bool V = true) { MergeForward = V; } 82 bool getMergeForward() const { return MergeForward; } 83 84 void setSExtIdx(int V) { SExtIdx = V; } 85 int getSExtIdx() const { return SExtIdx; } 86 }; 87 88 struct AArch64LoadStoreOpt : public MachineFunctionPass { 89 static char ID; 90 91 AArch64LoadStoreOpt() : MachineFunctionPass(ID) { 92 initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); 93 } 94 95 AliasAnalysis *AA; 96 const AArch64InstrInfo *TII; 97 const TargetRegisterInfo *TRI; 98 const AArch64Subtarget *Subtarget; 99 100 // Track which register units have been modified and used. 101 LiveRegUnits ModifiedRegUnits, UsedRegUnits; 102 103 void getAnalysisUsage(AnalysisUsage &AU) const override { 104 AU.addRequired<AAResultsWrapperPass>(); 105 MachineFunctionPass::getAnalysisUsage(AU); 106 } 107 108 // Scan the instructions looking for a load/store that can be combined 109 // with the current instruction into a load/store pair. 110 // Return the matching instruction if one is found, else MBB->end(). 111 MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, 112 LdStPairFlags &Flags, 113 unsigned Limit, 114 bool FindNarrowMerge); 115 116 // Scan the instructions looking for a store that writes to the address from 117 // which the current load instruction reads. Return true if one is found. 118 bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, 119 MachineBasicBlock::iterator &StoreI); 120 121 // Merge the two instructions indicated into a wider narrow store instruction. 122 MachineBasicBlock::iterator 123 mergeNarrowZeroStores(MachineBasicBlock::iterator I, 124 MachineBasicBlock::iterator MergeMI, 125 const LdStPairFlags &Flags); 126 127 // Merge the two instructions indicated into a single pair-wise instruction. 128 MachineBasicBlock::iterator 129 mergePairedInsns(MachineBasicBlock::iterator I, 130 MachineBasicBlock::iterator Paired, 131 const LdStPairFlags &Flags); 132 133 // Promote the load that reads directly from the address stored to. 134 MachineBasicBlock::iterator 135 promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 136 MachineBasicBlock::iterator StoreI); 137 138 // Scan the instruction list to find a base register update that can 139 // be combined with the current instruction (a load or store) using 140 // pre or post indexed addressing with writeback. Scan forwards. 141 MachineBasicBlock::iterator 142 findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, 143 int UnscaledOffset, unsigned Limit); 144 145 // Scan the instruction list to find a base register update that can 146 // be combined with the current instruction (a load or store) using 147 // pre or post indexed addressing with writeback. Scan backwards. 148 MachineBasicBlock::iterator 149 findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); 150 151 // Find an instruction that updates the base register of the ld/st 152 // instruction. 153 bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, 154 unsigned BaseReg, int Offset); 155 156 // Merge a pre- or post-index base register update into a ld/st instruction. 157 MachineBasicBlock::iterator 158 mergeUpdateInsn(MachineBasicBlock::iterator I, 159 MachineBasicBlock::iterator Update, bool IsPreIdx); 160 161 // Find and merge zero store instructions. 162 bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); 163 164 // Find and pair ldr/str instructions. 
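  // For example,
  //   ldr x0, [x2]
  //   ldr x1, [x2, #8]
  // is rewritten as
  //   ldp x0, x1, [x2]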
165 bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); 166 167 // Find and promote load instructions which read directly from a store. 168 bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); 169 170 // Find and merge base register updates before or after a ld/st instruction. 171 bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); 172 173 bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); 174 175 bool runOnMachineFunction(MachineFunction &Fn) override; 176 177 MachineFunctionProperties getRequiredProperties() const override { 178 return MachineFunctionProperties().set( 179 MachineFunctionProperties::Property::NoVRegs); 180 } 181 182 StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } 183 }; 184 185 char AArch64LoadStoreOpt::ID = 0; 186 187 } // end anonymous namespace 188 189 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", 190 AARCH64_LOAD_STORE_OPT_NAME, false, false) 191 192 static bool isNarrowStore(unsigned Opc) { 193 switch (Opc) { 194 default: 195 return false; 196 case AArch64::STRBBui: 197 case AArch64::STURBBi: 198 case AArch64::STRHHui: 199 case AArch64::STURHHi: 200 return true; 201 } 202 } 203 204 // These instructions set the memory tag and either keep the memory contents 205 // unchanged or set them to zero, ignoring the address part of the source register. 206 static bool isTagStore(const MachineInstr &MI) { 207 switch (MI.getOpcode()) { 208 default: 209 return false; 210 case AArch64::STGOffset: 211 case AArch64::STZGOffset: 212 case AArch64::ST2GOffset: 213 case AArch64::STZ2GOffset: 214 return true; 215 } 216 } 217 218 // Scaling factor for unscaled load or store. 219 static int getMemScale(const MachineInstr &MI) { 220 switch (MI.getOpcode()) { 221 default: 222 llvm_unreachable("Opcode has unknown scale!"); 223 case AArch64::LDRBBui: 224 case AArch64::LDURBBi: 225 case AArch64::LDRSBWui: 226 case AArch64::LDURSBWi: 227 case AArch64::STRBBui: 228 case AArch64::STURBBi: 229 return 1; 230 case AArch64::LDRHHui: 231 case AArch64::LDURHHi: 232 case AArch64::LDRSHWui: 233 case AArch64::LDURSHWi: 234 case AArch64::STRHHui: 235 case AArch64::STURHHi: 236 return 2; 237 case AArch64::LDRSui: 238 case AArch64::LDURSi: 239 case AArch64::LDRSWui: 240 case AArch64::LDURSWi: 241 case AArch64::LDRWui: 242 case AArch64::LDURWi: 243 case AArch64::STRSui: 244 case AArch64::STURSi: 245 case AArch64::STRWui: 246 case AArch64::STURWi: 247 case AArch64::LDPSi: 248 case AArch64::LDPSWi: 249 case AArch64::LDPWi: 250 case AArch64::STPSi: 251 case AArch64::STPWi: 252 return 4; 253 case AArch64::LDRDui: 254 case AArch64::LDURDi: 255 case AArch64::LDRXui: 256 case AArch64::LDURXi: 257 case AArch64::STRDui: 258 case AArch64::STURDi: 259 case AArch64::STRXui: 260 case AArch64::STURXi: 261 case AArch64::LDPDi: 262 case AArch64::LDPXi: 263 case AArch64::STPDi: 264 case AArch64::STPXi: 265 return 8; 266 case AArch64::LDRQui: 267 case AArch64::LDURQi: 268 case AArch64::STRQui: 269 case AArch64::STURQi: 270 case AArch64::LDPQi: 271 case AArch64::STPQi: 272 case AArch64::STGOffset: 273 case AArch64::STZGOffset: 274 case AArch64::ST2GOffset: 275 case AArch64::STZ2GOffset: 276 case AArch64::STGPi: 277 return 16; 278 } 279 } 280 281 static unsigned getMatchingNonSExtOpcode(unsigned Opc, 282 bool *IsValidLdStrOpc = nullptr) { 283 if (IsValidLdStrOpc) 284 *IsValidLdStrOpc = true; 285 switch (Opc) { 286 default: 287 if (IsValidLdStrOpc) 288 *IsValidLdStrOpc = false; 289 return std::numeric_limits<unsigned>::max(); 290 case
AArch64::STRDui: 291 case AArch64::STURDi: 292 case AArch64::STRQui: 293 case AArch64::STURQi: 294 case AArch64::STRBBui: 295 case AArch64::STURBBi: 296 case AArch64::STRHHui: 297 case AArch64::STURHHi: 298 case AArch64::STRWui: 299 case AArch64::STURWi: 300 case AArch64::STRXui: 301 case AArch64::STURXi: 302 case AArch64::LDRDui: 303 case AArch64::LDURDi: 304 case AArch64::LDRQui: 305 case AArch64::LDURQi: 306 case AArch64::LDRWui: 307 case AArch64::LDURWi: 308 case AArch64::LDRXui: 309 case AArch64::LDURXi: 310 case AArch64::STRSui: 311 case AArch64::STURSi: 312 case AArch64::LDRSui: 313 case AArch64::LDURSi: 314 return Opc; 315 case AArch64::LDRSWui: 316 return AArch64::LDRWui; 317 case AArch64::LDURSWi: 318 return AArch64::LDURWi; 319 } 320 } 321 322 static unsigned getMatchingWideOpcode(unsigned Opc) { 323 switch (Opc) { 324 default: 325 llvm_unreachable("Opcode has no wide equivalent!"); 326 case AArch64::STRBBui: 327 return AArch64::STRHHui; 328 case AArch64::STRHHui: 329 return AArch64::STRWui; 330 case AArch64::STURBBi: 331 return AArch64::STURHHi; 332 case AArch64::STURHHi: 333 return AArch64::STURWi; 334 case AArch64::STURWi: 335 return AArch64::STURXi; 336 case AArch64::STRWui: 337 return AArch64::STRXui; 338 } 339 } 340 341 static unsigned getMatchingPairOpcode(unsigned Opc) { 342 switch (Opc) { 343 default: 344 llvm_unreachable("Opcode has no pairwise equivalent!"); 345 case AArch64::STRSui: 346 case AArch64::STURSi: 347 return AArch64::STPSi; 348 case AArch64::STRDui: 349 case AArch64::STURDi: 350 return AArch64::STPDi; 351 case AArch64::STRQui: 352 case AArch64::STURQi: 353 return AArch64::STPQi; 354 case AArch64::STRWui: 355 case AArch64::STURWi: 356 return AArch64::STPWi; 357 case AArch64::STRXui: 358 case AArch64::STURXi: 359 return AArch64::STPXi; 360 case AArch64::LDRSui: 361 case AArch64::LDURSi: 362 return AArch64::LDPSi; 363 case AArch64::LDRDui: 364 case AArch64::LDURDi: 365 return AArch64::LDPDi; 366 case AArch64::LDRQui: 367 case AArch64::LDURQi: 368 return AArch64::LDPQi; 369 case AArch64::LDRWui: 370 case AArch64::LDURWi: 371 return AArch64::LDPWi; 372 case AArch64::LDRXui: 373 case AArch64::LDURXi: 374 return AArch64::LDPXi; 375 case AArch64::LDRSWui: 376 case AArch64::LDURSWi: 377 return AArch64::LDPSWi; 378 } 379 } 380 381 static unsigned isMatchingStore(MachineInstr &LoadInst, 382 MachineInstr &StoreInst) { 383 unsigned LdOpc = LoadInst.getOpcode(); 384 unsigned StOpc = StoreInst.getOpcode(); 385 switch (LdOpc) { 386 default: 387 llvm_unreachable("Unsupported load instruction!"); 388 case AArch64::LDRBBui: 389 return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || 390 StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 391 case AArch64::LDURBBi: 392 return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || 393 StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 394 case AArch64::LDRHHui: 395 return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || 396 StOpc == AArch64::STRXui; 397 case AArch64::LDURHHi: 398 return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || 399 StOpc == AArch64::STURXi; 400 case AArch64::LDRWui: 401 return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; 402 case AArch64::LDURWi: 403 return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; 404 case AArch64::LDRXui: 405 return StOpc == AArch64::STRXui; 406 case AArch64::LDURXi: 407 return StOpc == AArch64::STURXi; 408 } 409 } 410 411 static unsigned getPreIndexedOpcode(unsigned Opc) { 412 // FIXME: We don't currently support creating 
pre-indexed loads/stores when 413 // the load or store is the unscaled version. If we decide to perform such an 414 // optimization in the future the cases for the unscaled loads/stores will 415 // need to be added here. 416 switch (Opc) { 417 default: 418 llvm_unreachable("Opcode has no pre-indexed equivalent!"); 419 case AArch64::STRSui: 420 return AArch64::STRSpre; 421 case AArch64::STRDui: 422 return AArch64::STRDpre; 423 case AArch64::STRQui: 424 return AArch64::STRQpre; 425 case AArch64::STRBBui: 426 return AArch64::STRBBpre; 427 case AArch64::STRHHui: 428 return AArch64::STRHHpre; 429 case AArch64::STRWui: 430 return AArch64::STRWpre; 431 case AArch64::STRXui: 432 return AArch64::STRXpre; 433 case AArch64::LDRSui: 434 return AArch64::LDRSpre; 435 case AArch64::LDRDui: 436 return AArch64::LDRDpre; 437 case AArch64::LDRQui: 438 return AArch64::LDRQpre; 439 case AArch64::LDRBBui: 440 return AArch64::LDRBBpre; 441 case AArch64::LDRHHui: 442 return AArch64::LDRHHpre; 443 case AArch64::LDRWui: 444 return AArch64::LDRWpre; 445 case AArch64::LDRXui: 446 return AArch64::LDRXpre; 447 case AArch64::LDRSWui: 448 return AArch64::LDRSWpre; 449 case AArch64::LDPSi: 450 return AArch64::LDPSpre; 451 case AArch64::LDPSWi: 452 return AArch64::LDPSWpre; 453 case AArch64::LDPDi: 454 return AArch64::LDPDpre; 455 case AArch64::LDPQi: 456 return AArch64::LDPQpre; 457 case AArch64::LDPWi: 458 return AArch64::LDPWpre; 459 case AArch64::LDPXi: 460 return AArch64::LDPXpre; 461 case AArch64::STPSi: 462 return AArch64::STPSpre; 463 case AArch64::STPDi: 464 return AArch64::STPDpre; 465 case AArch64::STPQi: 466 return AArch64::STPQpre; 467 case AArch64::STPWi: 468 return AArch64::STPWpre; 469 case AArch64::STPXi: 470 return AArch64::STPXpre; 471 case AArch64::STGOffset: 472 return AArch64::STGPreIndex; 473 case AArch64::STZGOffset: 474 return AArch64::STZGPreIndex; 475 case AArch64::ST2GOffset: 476 return AArch64::ST2GPreIndex; 477 case AArch64::STZ2GOffset: 478 return AArch64::STZ2GPreIndex; 479 case AArch64::STGPi: 480 return AArch64::STGPpre; 481 } 482 } 483 484 static unsigned getPostIndexedOpcode(unsigned Opc) { 485 switch (Opc) { 486 default: 487 llvm_unreachable("Opcode has no post-indexed wise equivalent!"); 488 case AArch64::STRSui: 489 case AArch64::STURSi: 490 return AArch64::STRSpost; 491 case AArch64::STRDui: 492 case AArch64::STURDi: 493 return AArch64::STRDpost; 494 case AArch64::STRQui: 495 case AArch64::STURQi: 496 return AArch64::STRQpost; 497 case AArch64::STRBBui: 498 return AArch64::STRBBpost; 499 case AArch64::STRHHui: 500 return AArch64::STRHHpost; 501 case AArch64::STRWui: 502 case AArch64::STURWi: 503 return AArch64::STRWpost; 504 case AArch64::STRXui: 505 case AArch64::STURXi: 506 return AArch64::STRXpost; 507 case AArch64::LDRSui: 508 case AArch64::LDURSi: 509 return AArch64::LDRSpost; 510 case AArch64::LDRDui: 511 case AArch64::LDURDi: 512 return AArch64::LDRDpost; 513 case AArch64::LDRQui: 514 case AArch64::LDURQi: 515 return AArch64::LDRQpost; 516 case AArch64::LDRBBui: 517 return AArch64::LDRBBpost; 518 case AArch64::LDRHHui: 519 return AArch64::LDRHHpost; 520 case AArch64::LDRWui: 521 case AArch64::LDURWi: 522 return AArch64::LDRWpost; 523 case AArch64::LDRXui: 524 case AArch64::LDURXi: 525 return AArch64::LDRXpost; 526 case AArch64::LDRSWui: 527 return AArch64::LDRSWpost; 528 case AArch64::LDPSi: 529 return AArch64::LDPSpost; 530 case AArch64::LDPSWi: 531 return AArch64::LDPSWpost; 532 case AArch64::LDPDi: 533 return AArch64::LDPDpost; 534 case AArch64::LDPQi: 535 return 
AArch64::LDPQpost; 536 case AArch64::LDPWi: 537 return AArch64::LDPWpost; 538 case AArch64::LDPXi: 539 return AArch64::LDPXpost; 540 case AArch64::STPSi: 541 return AArch64::STPSpost; 542 case AArch64::STPDi: 543 return AArch64::STPDpost; 544 case AArch64::STPQi: 545 return AArch64::STPQpost; 546 case AArch64::STPWi: 547 return AArch64::STPWpost; 548 case AArch64::STPXi: 549 return AArch64::STPXpost; 550 case AArch64::STGOffset: 551 return AArch64::STGPostIndex; 552 case AArch64::STZGOffset: 553 return AArch64::STZGPostIndex; 554 case AArch64::ST2GOffset: 555 return AArch64::ST2GPostIndex; 556 case AArch64::STZ2GOffset: 557 return AArch64::STZ2GPostIndex; 558 case AArch64::STGPi: 559 return AArch64::STGPpost; 560 } 561 } 562 563 static bool isPairedLdSt(const MachineInstr &MI) { 564 switch (MI.getOpcode()) { 565 default: 566 return false; 567 case AArch64::LDPSi: 568 case AArch64::LDPSWi: 569 case AArch64::LDPDi: 570 case AArch64::LDPQi: 571 case AArch64::LDPWi: 572 case AArch64::LDPXi: 573 case AArch64::STPSi: 574 case AArch64::STPDi: 575 case AArch64::STPQi: 576 case AArch64::STPWi: 577 case AArch64::STPXi: 578 case AArch64::STGPi: 579 return true; 580 } 581 } 582 583 // Returns the scale and offset range of pre/post indexed variants of MI. 584 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, 585 int &MinOffset, int &MaxOffset) { 586 bool IsPaired = isPairedLdSt(MI); 587 bool IsTagStore = isTagStore(MI); 588 // ST*G and all paired ldst have the same scale in pre/post-indexed variants 589 // as in the "unsigned offset" variant. 590 // All other pre/post indexed ldst instructions are unscaled. 591 Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1; 592 593 if (IsPaired) { 594 MinOffset = -64; 595 MaxOffset = 63; 596 } else { 597 MinOffset = -256; 598 MaxOffset = 255; 599 } 600 } 601 602 static const MachineOperand &getLdStRegOp(const MachineInstr &MI, 603 unsigned PairedRegOp = 0) { 604 assert(PairedRegOp < 2 && "Unexpected register operand idx."); 605 unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; 606 return MI.getOperand(Idx); 607 } 608 609 static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { 610 unsigned Idx = isPairedLdSt(MI) ? 2 : 1; 611 return MI.getOperand(Idx); 612 } 613 614 static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { 615 unsigned Idx = isPairedLdSt(MI) ? 3 : 2; 616 return MI.getOperand(Idx); 617 } 618 619 static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, 620 MachineInstr &StoreInst, 621 const AArch64InstrInfo *TII) { 622 assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); 623 int LoadSize = getMemScale(LoadInst); 624 int StoreSize = getMemScale(StoreInst); 625 int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) 626 ? getLdStOffsetOp(StoreInst).getImm() 627 : getLdStOffsetOp(StoreInst).getImm() * StoreSize; 628 int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) 629 ? 
getLdStOffsetOp(LoadInst).getImm() 630 : getLdStOffsetOp(LoadInst).getImm() * LoadSize; 631 return (UnscaledStOffset <= UnscaledLdOffset) && 632 (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); 633 } 634 635 static bool isPromotableZeroStoreInst(MachineInstr &MI) { 636 unsigned Opc = MI.getOpcode(); 637 return (Opc == AArch64::STRWui || Opc == AArch64::STURWi || 638 isNarrowStore(Opc)) && 639 getLdStRegOp(MI).getReg() == AArch64::WZR; 640 } 641 642 static bool isPromotableLoadFromStore(MachineInstr &MI) { 643 switch (MI.getOpcode()) { 644 default: 645 return false; 646 // Scaled instructions. 647 case AArch64::LDRBBui: 648 case AArch64::LDRHHui: 649 case AArch64::LDRWui: 650 case AArch64::LDRXui: 651 // Unscaled instructions. 652 case AArch64::LDURBBi: 653 case AArch64::LDURHHi: 654 case AArch64::LDURWi: 655 case AArch64::LDURXi: 656 return true; 657 } 658 } 659 660 static bool isMergeableLdStUpdate(MachineInstr &MI) { 661 unsigned Opc = MI.getOpcode(); 662 switch (Opc) { 663 default: 664 return false; 665 // Scaled instructions. 666 case AArch64::STRSui: 667 case AArch64::STRDui: 668 case AArch64::STRQui: 669 case AArch64::STRXui: 670 case AArch64::STRWui: 671 case AArch64::STRHHui: 672 case AArch64::STRBBui: 673 case AArch64::LDRSui: 674 case AArch64::LDRDui: 675 case AArch64::LDRQui: 676 case AArch64::LDRXui: 677 case AArch64::LDRWui: 678 case AArch64::LDRHHui: 679 case AArch64::LDRBBui: 680 case AArch64::STGOffset: 681 case AArch64::STZGOffset: 682 case AArch64::ST2GOffset: 683 case AArch64::STZ2GOffset: 684 case AArch64::STGPi: 685 // Unscaled instructions. 686 case AArch64::STURSi: 687 case AArch64::STURDi: 688 case AArch64::STURQi: 689 case AArch64::STURWi: 690 case AArch64::STURXi: 691 case AArch64::LDURSi: 692 case AArch64::LDURDi: 693 case AArch64::LDURQi: 694 case AArch64::LDURWi: 695 case AArch64::LDURXi: 696 // Paired instructions. 697 case AArch64::LDPSi: 698 case AArch64::LDPSWi: 699 case AArch64::LDPDi: 700 case AArch64::LDPQi: 701 case AArch64::LDPWi: 702 case AArch64::LDPXi: 703 case AArch64::STPSi: 704 case AArch64::STPDi: 705 case AArch64::STPQi: 706 case AArch64::STPWi: 707 case AArch64::STPXi: 708 // Make sure this is a reg+imm (as opposed to an address reloc). 709 if (!getLdStOffsetOp(MI).isImm()) 710 return false; 711 712 return true; 713 } 714 } 715 716 MachineBasicBlock::iterator 717 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, 718 MachineBasicBlock::iterator MergeMI, 719 const LdStPairFlags &Flags) { 720 assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && 721 "Expected promotable zero stores."); 722 723 MachineBasicBlock::iterator NextI = I; 724 ++NextI; 725 // If NextI is the second of the two instructions to be merged, we need 726 // to skip one further. Either way we merge will invalidate the iterator, 727 // and we don't need to scan the new instruction, as it's a pairwise 728 // instruction, which we're not considering for further action anyway. 729 if (NextI == MergeMI) 730 ++NextI; 731 732 unsigned Opc = I->getOpcode(); 733 bool IsScaled = !TII->isUnscaledLdSt(Opc); 734 int OffsetStride = IsScaled ? 1 : getMemScale(*I); 735 736 bool MergeForward = Flags.getMergeForward(); 737 // Insert our new paired instruction after whichever of the paired 738 // instructions MergeForward indicates. 739 MachineBasicBlock::iterator InsertionPoint = MergeForward ? 
MergeMI : I; 740 // Also based on MergeForward is from where we copy the base register operand 741 // so we get the flags compatible with the input code. 742 const MachineOperand &BaseRegOp = 743 MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); 744 745 // Which register is Rt and which is Rt2 depends on the offset order. 746 MachineInstr *RtMI; 747 if (getLdStOffsetOp(*I).getImm() == 748 getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) 749 RtMI = &*MergeMI; 750 else 751 RtMI = &*I; 752 753 int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); 754 // Change the scaled offset from small to large type. 755 if (IsScaled) { 756 assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); 757 OffsetImm /= 2; 758 } 759 760 // Construct the new instruction. 761 DebugLoc DL = I->getDebugLoc(); 762 MachineBasicBlock *MBB = I->getParent(); 763 MachineInstrBuilder MIB; 764 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) 765 .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) 766 .add(BaseRegOp) 767 .addImm(OffsetImm) 768 .cloneMergedMemRefs({&*I, &*MergeMI}) 769 .setMIFlags(I->mergeFlagsWith(*MergeMI)); 770 (void)MIB; 771 772 LLVM_DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n "); 773 LLVM_DEBUG(I->print(dbgs())); 774 LLVM_DEBUG(dbgs() << " "); 775 LLVM_DEBUG(MergeMI->print(dbgs())); 776 LLVM_DEBUG(dbgs() << " with instruction:\n "); 777 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 778 LLVM_DEBUG(dbgs() << "\n"); 779 780 // Erase the old instructions. 781 I->eraseFromParent(); 782 MergeMI->eraseFromParent(); 783 return NextI; 784 } 785 786 MachineBasicBlock::iterator 787 AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, 788 MachineBasicBlock::iterator Paired, 789 const LdStPairFlags &Flags) { 790 MachineBasicBlock::iterator NextI = I; 791 ++NextI; 792 // If NextI is the second of the two instructions to be merged, we need 793 // to skip one further. Either way we merge will invalidate the iterator, 794 // and we don't need to scan the new instruction, as it's a pairwise 795 // instruction, which we're not considering for further action anyway. 796 if (NextI == Paired) 797 ++NextI; 798 799 int SExtIdx = Flags.getSExtIdx(); 800 unsigned Opc = 801 SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); 802 bool IsUnscaled = TII->isUnscaledLdSt(Opc); 803 int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; 804 805 bool MergeForward = Flags.getMergeForward(); 806 // Insert our new paired instruction after whichever of the paired 807 // instructions MergeForward indicates. 808 MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; 809 // Also based on MergeForward is from where we copy the base register operand 810 // so we get the flags compatible with the input code. 811 const MachineOperand &BaseRegOp = 812 MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); 813 814 int Offset = getLdStOffsetOp(*I).getImm(); 815 int PairedOffset = getLdStOffsetOp(*Paired).getImm(); 816 bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); 817 if (IsUnscaled != PairedIsUnscaled) { 818 // We're trying to pair instructions that differ in how they are scaled. If 819 // I is scaled then scale the offset of Paired accordingly. Otherwise, do 820 // the opposite (i.e., make Paired's offset unscaled). 821 int MemSize = getMemScale(*Paired); 822 if (PairedIsUnscaled) { 823 // If the unscaled offset isn't a multiple of the MemSize, we can't 824 // pair the operations together. 
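      // For example, a scaled STRWui with immediate 1 addresses byte offset 4,
      // and an unscaled STURWi partner at byte offset 12 is rescaled here to
      // 12 / 4 == 3 word units; an offset such as 10 is not representable and
      // is already rejected in findMatchingInsn, hence the assert below.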
825 assert(!(PairedOffset % getMemScale(*Paired)) && 826 "Offset should be a multiple of the stride!"); 827 PairedOffset /= MemSize; 828 } else { 829 PairedOffset *= MemSize; 830 } 831 } 832 833 // Which register is Rt and which is Rt2 depends on the offset order. 834 MachineInstr *RtMI, *Rt2MI; 835 if (Offset == PairedOffset + OffsetStride) { 836 RtMI = &*Paired; 837 Rt2MI = &*I; 838 // Here we swapped the assumption made for SExtIdx. 839 // I.e., we turn ldp I, Paired into ldp Paired, I. 840 // Update the index accordingly. 841 if (SExtIdx != -1) 842 SExtIdx = (SExtIdx + 1) % 2; 843 } else { 844 RtMI = &*I; 845 Rt2MI = &*Paired; 846 } 847 int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); 848 // Scale the immediate offset, if necessary. 849 if (TII->isUnscaledLdSt(RtMI->getOpcode())) { 850 assert(!(OffsetImm % getMemScale(*RtMI)) && 851 "Unscaled offset cannot be scaled."); 852 OffsetImm /= getMemScale(*RtMI); 853 } 854 855 // Construct the new instruction. 856 MachineInstrBuilder MIB; 857 DebugLoc DL = I->getDebugLoc(); 858 MachineBasicBlock *MBB = I->getParent(); 859 MachineOperand RegOp0 = getLdStRegOp(*RtMI); 860 MachineOperand RegOp1 = getLdStRegOp(*Rt2MI); 861 // Kill flags may become invalid when moving stores for pairing. 862 if (RegOp0.isUse()) { 863 if (!MergeForward) { 864 // Clear kill flags on store if moving upwards. Example: 865 // STRWui %w0, ... 866 // USE %w1 867 // STRWui kill %w1 ; need to clear kill flag when moving STRWui upwards 868 RegOp0.setIsKill(false); 869 RegOp1.setIsKill(false); 870 } else { 871 // Clear kill flags of the first stores register. Example: 872 // STRWui %w1, ... 873 // USE kill %w1 ; need to clear kill flag when moving STRWui downwards 874 // STRW %w0 875 Register Reg = getLdStRegOp(*I).getReg(); 876 for (MachineInstr &MI : make_range(std::next(I), Paired)) 877 MI.clearRegisterKills(Reg, TRI); 878 } 879 } 880 MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) 881 .add(RegOp0) 882 .add(RegOp1) 883 .add(BaseRegOp) 884 .addImm(OffsetImm) 885 .cloneMergedMemRefs({&*I, &*Paired}) 886 .setMIFlags(I->mergeFlagsWith(*Paired)); 887 888 (void)MIB; 889 890 LLVM_DEBUG( 891 dbgs() << "Creating pair load/store. Replacing instructions:\n "); 892 LLVM_DEBUG(I->print(dbgs())); 893 LLVM_DEBUG(dbgs() << " "); 894 LLVM_DEBUG(Paired->print(dbgs())); 895 LLVM_DEBUG(dbgs() << " with instruction:\n "); 896 if (SExtIdx != -1) { 897 // Generate the sign extension for the proper result of the ldp. 898 // I.e., with X1, that would be: 899 // %w1 = KILL %w1, implicit-def %x1 900 // %x1 = SBFMXri killed %x1, 0, 31 901 MachineOperand &DstMO = MIB->getOperand(SExtIdx); 902 // Right now, DstMO has the extended register, since it comes from an 903 // extended opcode. 904 Register DstRegX = DstMO.getReg(); 905 // Get the W variant of that register. 906 Register DstRegW = TRI->getSubReg(DstRegX, AArch64::sub_32); 907 // Update the result of LDP to use the W instead of the X variant. 908 DstMO.setReg(DstRegW); 909 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 910 LLVM_DEBUG(dbgs() << "\n"); 911 // Make the machine verifier happy by providing a definition for 912 // the X register. 913 // Insert this definition right after the generated LDP, i.e., before 914 // InsertionPoint. 915 MachineInstrBuilder MIBKill = 916 BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) 917 .addReg(DstRegW) 918 .addReg(DstRegX, RegState::Define); 919 MIBKill->getOperand(2).setImplicit(); 920 // Create the sign extension. 
921 MachineInstrBuilder MIBSXTW = 922 BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) 923 .addReg(DstRegX) 924 .addImm(0) 925 .addImm(31); 926 (void)MIBSXTW; 927 LLVM_DEBUG(dbgs() << " Extend operand:\n "); 928 LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); 929 } else { 930 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 931 } 932 LLVM_DEBUG(dbgs() << "\n"); 933 934 // Erase the old instructions. 935 I->eraseFromParent(); 936 Paired->eraseFromParent(); 937 938 return NextI; 939 } 940 941 MachineBasicBlock::iterator 942 AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, 943 MachineBasicBlock::iterator StoreI) { 944 MachineBasicBlock::iterator NextI = LoadI; 945 ++NextI; 946 947 int LoadSize = getMemScale(*LoadI); 948 int StoreSize = getMemScale(*StoreI); 949 Register LdRt = getLdStRegOp(*LoadI).getReg(); 950 const MachineOperand &StMO = getLdStRegOp(*StoreI); 951 Register StRt = getLdStRegOp(*StoreI).getReg(); 952 bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); 953 954 assert((IsStoreXReg || 955 TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && 956 "Unexpected RegClass"); 957 958 MachineInstr *BitExtMI; 959 if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { 960 // Remove the load, if the destination register of the loads is the same 961 // register for stored value. 962 if (StRt == LdRt && LoadSize == 8) { 963 for (MachineInstr &MI : make_range(StoreI->getIterator(), 964 LoadI->getIterator())) { 965 if (MI.killsRegister(StRt, TRI)) { 966 MI.clearRegisterKills(StRt, TRI); 967 break; 968 } 969 } 970 LLVM_DEBUG(dbgs() << "Remove load instruction:\n "); 971 LLVM_DEBUG(LoadI->print(dbgs())); 972 LLVM_DEBUG(dbgs() << "\n"); 973 LoadI->eraseFromParent(); 974 return NextI; 975 } 976 // Replace the load with a mov if the load and store are in the same size. 977 BitExtMI = 978 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 979 TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) 980 .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR) 981 .add(StMO) 982 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 983 .setMIFlags(LoadI->getFlags()); 984 } else { 985 // FIXME: Currently we disable this transformation in big-endian targets as 986 // performance and correctness are verified only in little-endian. 987 if (!Subtarget->isLittleEndian()) 988 return NextI; 989 bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); 990 assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && 991 "Unsupported ld/st match"); 992 assert(LoadSize <= StoreSize && "Invalid load size"); 993 int UnscaledLdOffset = IsUnscaled 994 ? getLdStOffsetOp(*LoadI).getImm() 995 : getLdStOffsetOp(*LoadI).getImm() * LoadSize; 996 int UnscaledStOffset = IsUnscaled 997 ? getLdStOffsetOp(*StoreI).getImm() 998 : getLdStOffsetOp(*StoreI).getImm() * StoreSize; 999 int Width = LoadSize * 8; 1000 unsigned DestReg = 1001 IsStoreXReg ? Register(TRI->getMatchingSuperReg( 1002 LdRt, AArch64::sub_32, &AArch64::GPR64RegClass)) 1003 : LdRt; 1004 1005 assert((UnscaledLdOffset >= UnscaledStOffset && 1006 (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && 1007 "Invalid offset"); 1008 1009 int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); 1010 int Imms = Immr + Width - 1; 1011 if (UnscaledLdOffset == UnscaledStOffset) { 1012 uint32_t AndMaskEncoded = ((IsStoreXReg ? 
1 : 0) << 12) // N 1013 | ((Immr) << 6) // immr 1014 | ((Imms) << 0) // imms 1015 ; 1016 1017 BitExtMI = 1018 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1019 TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), 1020 DestReg) 1021 .add(StMO) 1022 .addImm(AndMaskEncoded) 1023 .setMIFlags(LoadI->getFlags()); 1024 } else { 1025 BitExtMI = 1026 BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), 1027 TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), 1028 DestReg) 1029 .add(StMO) 1030 .addImm(Immr) 1031 .addImm(Imms) 1032 .setMIFlags(LoadI->getFlags()); 1033 } 1034 } 1035 1036 // Clear kill flags between store and load. 1037 for (MachineInstr &MI : make_range(StoreI->getIterator(), 1038 BitExtMI->getIterator())) 1039 if (MI.killsRegister(StRt, TRI)) { 1040 MI.clearRegisterKills(StRt, TRI); 1041 break; 1042 } 1043 1044 LLVM_DEBUG(dbgs() << "Promoting load by replacing :\n "); 1045 LLVM_DEBUG(StoreI->print(dbgs())); 1046 LLVM_DEBUG(dbgs() << " "); 1047 LLVM_DEBUG(LoadI->print(dbgs())); 1048 LLVM_DEBUG(dbgs() << " with instructions:\n "); 1049 LLVM_DEBUG(StoreI->print(dbgs())); 1050 LLVM_DEBUG(dbgs() << " "); 1051 LLVM_DEBUG((BitExtMI)->print(dbgs())); 1052 LLVM_DEBUG(dbgs() << "\n"); 1053 1054 // Erase the old instructions. 1055 LoadI->eraseFromParent(); 1056 return NextI; 1057 } 1058 1059 static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { 1060 // Convert the byte-offset used by unscaled into an "element" offset used 1061 // by the scaled pair load/store instructions. 1062 if (IsUnscaled) { 1063 // If the byte-offset isn't a multiple of the stride, there's no point 1064 // trying to match it. 1065 if (Offset % OffsetStride) 1066 return false; 1067 Offset /= OffsetStride; 1068 } 1069 return Offset <= 63 && Offset >= -64; 1070 } 1071 1072 // Do alignment, specialized to power of 2 and for signed ints, 1073 // avoiding having to do a C-style cast from uint_64t to int when 1074 // using alignTo from include/llvm/Support/MathExtras.h. 1075 // FIXME: Move this function to include/MathExtras.h? 1076 static int alignTo(int Num, int PowOf2) { 1077 return (Num + PowOf2 - 1) & ~(PowOf2 - 1); 1078 } 1079 1080 static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, 1081 AliasAnalysis *AA) { 1082 // One of the instructions must modify memory. 1083 if (!MIa.mayStore() && !MIb.mayStore()) 1084 return false; 1085 1086 // Both instructions must be memory operations. 1087 if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) 1088 return false; 1089 1090 return MIa.mayAlias(AA, MIb, /*UseTBAA*/false); 1091 } 1092 1093 static bool mayAlias(MachineInstr &MIa, 1094 SmallVectorImpl<MachineInstr *> &MemInsns, 1095 AliasAnalysis *AA) { 1096 for (MachineInstr *MIb : MemInsns) 1097 if (mayAlias(MIa, *MIb, AA)) 1098 return true; 1099 1100 return false; 1101 } 1102 1103 bool AArch64LoadStoreOpt::findMatchingStore( 1104 MachineBasicBlock::iterator I, unsigned Limit, 1105 MachineBasicBlock::iterator &StoreI) { 1106 MachineBasicBlock::iterator B = I->getParent()->begin(); 1107 MachineBasicBlock::iterator MBBI = I; 1108 MachineInstr &LoadMI = *I; 1109 Register BaseReg = getLdStBaseOp(LoadMI).getReg(); 1110 1111 // If the load is the first instruction in the block, there's obviously 1112 // not any matching store. 1113 if (MBBI == B) 1114 return false; 1115 1116 // Track which register units have been modified and used between the first 1117 // insn and the second insn. 
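  // For example, when scanning backwards from
  //   ldrh w2, [x0, #6]
  // a store such as
  //   str  w1, [x0, #4]
  // covers all of the loaded bytes, so promoteLoadFromStore can later rewrite
  // the load as a bitfield extract of w1.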
1118 ModifiedRegUnits.clear(); 1119 UsedRegUnits.clear(); 1120 1121 unsigned Count = 0; 1122 do { 1123 --MBBI; 1124 MachineInstr &MI = *MBBI; 1125 1126 // Don't count transient instructions towards the search limit since there 1127 // may be different numbers of them if e.g. debug information is present. 1128 if (!MI.isTransient()) 1129 ++Count; 1130 1131 // If the load instruction reads directly from the address to which the 1132 // store instruction writes and the stored value is not modified, we can 1133 // promote the load. Since we do not handle stores with pre-/post-index, 1134 // it's unnecessary to check if BaseReg is modified by the store itself. 1135 if (MI.mayStore() && isMatchingStore(LoadMI, MI) && 1136 BaseReg == getLdStBaseOp(MI).getReg() && 1137 isLdOffsetInRangeOfSt(LoadMI, MI, TII) && 1138 ModifiedRegUnits.available(getLdStRegOp(MI).getReg())) { 1139 StoreI = MBBI; 1140 return true; 1141 } 1142 1143 if (MI.isCall()) 1144 return false; 1145 1146 // Update modified / uses register units. 1147 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 1148 1149 // Otherwise, if the base register is modified, we have no match, so 1150 // return early. 1151 if (!ModifiedRegUnits.available(BaseReg)) 1152 return false; 1153 1154 // If we encounter a store aliased with the load, return early. 1155 if (MI.mayStore() && mayAlias(LoadMI, MI, AA)) 1156 return false; 1157 } while (MBBI != B && Count < Limit); 1158 return false; 1159 } 1160 1161 // Returns true if FirstMI and MI are candidates for merging or pairing. 1162 // Otherwise, returns false. 1163 static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, 1164 LdStPairFlags &Flags, 1165 const AArch64InstrInfo *TII) { 1166 // If this is volatile or if pairing is suppressed, not a candidate. 1167 if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) 1168 return false; 1169 1170 // We should have already checked FirstMI for pair suppression and volatility. 1171 assert(!FirstMI.hasOrderedMemoryRef() && 1172 !TII->isLdStPairSuppressed(FirstMI) && 1173 "FirstMI shouldn't get here if either of these checks are true."); 1174 1175 unsigned OpcA = FirstMI.getOpcode(); 1176 unsigned OpcB = MI.getOpcode(); 1177 1178 // Opcodes match: nothing more to check. 1179 if (OpcA == OpcB) 1180 return true; 1181 1182 // Try to match a sign-extended load/store with a zero-extended load/store. 1183 bool IsValidLdStrOpc, PairIsValidLdStrOpc; 1184 unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); 1185 assert(IsValidLdStrOpc && 1186 "Given Opc should be a Load or Store with an immediate"); 1187 // OpcA will be the first instruction in the pair. 1188 if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { 1189 Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); 1190 return true; 1191 } 1192 1193 // If the second instruction isn't even a mergable/pairable load/store, bail 1194 // out. 1195 if (!PairIsValidLdStrOpc) 1196 return false; 1197 1198 // FIXME: We don't support merging narrow stores with mixed scaled/unscaled 1199 // offsets. 1200 if (isNarrowStore(OpcA) || isNarrowStore(OpcB)) 1201 return false; 1202 1203 // Try to match an unscaled load/store with a scaled load/store. 1204 return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && 1205 getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); 1206 1207 // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? 
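  // For example, STRWui (scaled) and STURWi (unscaled) both map to STPWi, so
  //   str  w0, [x1, #4]
  //   stur w2, [x1, #8]
  // remains a candidate pair; offset compatibility is checked later in
  // findMatchingInsn.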
1208 } 1209 1210 /// Scan the instructions looking for a load/store that can be combined with the 1211 /// current instruction into a wider equivalent or a load/store pair. 1212 MachineBasicBlock::iterator 1213 AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, 1214 LdStPairFlags &Flags, unsigned Limit, 1215 bool FindNarrowMerge) { 1216 MachineBasicBlock::iterator E = I->getParent()->end(); 1217 MachineBasicBlock::iterator MBBI = I; 1218 MachineInstr &FirstMI = *I; 1219 ++MBBI; 1220 1221 bool MayLoad = FirstMI.mayLoad(); 1222 bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); 1223 Register Reg = getLdStRegOp(FirstMI).getReg(); 1224 Register BaseReg = getLdStBaseOp(FirstMI).getReg(); 1225 int Offset = getLdStOffsetOp(FirstMI).getImm(); 1226 int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; 1227 bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); 1228 1229 // Track which register units have been modified and used between the first 1230 // insn (inclusive) and the second insn. 1231 ModifiedRegUnits.clear(); 1232 UsedRegUnits.clear(); 1233 1234 // Remember any instructions that read/write memory between FirstMI and MI. 1235 SmallVector<MachineInstr *, 4> MemInsns; 1236 1237 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { 1238 MachineInstr &MI = *MBBI; 1239 1240 // Don't count transient instructions towards the search limit since there 1241 // may be different numbers of them if e.g. debug information is present. 1242 if (!MI.isTransient()) 1243 ++Count; 1244 1245 Flags.setSExtIdx(-1); 1246 if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && 1247 getLdStOffsetOp(MI).isImm()) { 1248 assert(MI.mayLoadOrStore() && "Expected memory operation."); 1249 // If we've found another instruction with the same opcode, check to see 1250 // if the base and offset are compatible with our starting instruction. 1251 // These instructions all have scaled immediate operands, so we just 1252 // check for +1/-1. Make sure to check the new instruction offset is 1253 // actually an immediate and not a symbolic reference destined for 1254 // a relocation. 1255 Register MIBaseReg = getLdStBaseOp(MI).getReg(); 1256 int MIOffset = getLdStOffsetOp(MI).getImm(); 1257 bool MIIsUnscaled = TII->isUnscaledLdSt(MI); 1258 if (IsUnscaled != MIIsUnscaled) { 1259 // We're trying to pair instructions that differ in how they are scaled. 1260 // If FirstMI is scaled then scale the offset of MI accordingly. 1261 // Otherwise, do the opposite (i.e., make MI's offset unscaled). 1262 int MemSize = getMemScale(MI); 1263 if (MIIsUnscaled) { 1264 // If the unscaled offset isn't a multiple of the MemSize, we can't 1265 // pair the operations together: bail and keep looking. 1266 if (MIOffset % MemSize) { 1267 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1268 UsedRegUnits, TRI); 1269 MemInsns.push_back(&MI); 1270 continue; 1271 } 1272 MIOffset /= MemSize; 1273 } else { 1274 MIOffset *= MemSize; 1275 } 1276 } 1277 1278 if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || 1279 (Offset + OffsetStride == MIOffset))) { 1280 int MinOffset = Offset < MIOffset ? Offset : MIOffset; 1281 if (FindNarrowMerge) { 1282 // If the alignment requirements of the scaled wide load/store 1283 // instruction can't express the offset of the scaled narrow input, 1284 // bail and keep looking. For promotable zero stores, allow only when 1285 // the stored value is the same (i.e., WZR). 
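          // For example, scaled stores
          //   strh wzr, [x0, #2]
          //   strh wzr, [x0, #4]
          // cannot be merged, because the resulting word store would need byte
          // offset 2, which the scaled STRWui form cannot encode.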
1286 if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || 1287 (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { 1288 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1289 UsedRegUnits, TRI); 1290 MemInsns.push_back(&MI); 1291 continue; 1292 } 1293 } else { 1294 // Pairwise instructions have a 7-bit signed offset field. Single 1295 // insns have a 12-bit unsigned offset field. If the resultant 1296 // immediate offset of merging these instructions is out of range for 1297 // a pairwise instruction, bail and keep looking. 1298 if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { 1299 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1300 UsedRegUnits, TRI); 1301 MemInsns.push_back(&MI); 1302 continue; 1303 } 1304 // If the alignment requirements of the paired (scaled) instruction 1305 // can't express the offset of the unscaled input, bail and keep 1306 // looking. 1307 if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { 1308 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, 1309 UsedRegUnits, TRI); 1310 MemInsns.push_back(&MI); 1311 continue; 1312 } 1313 } 1314 // If the destination register of the loads is the same register, bail 1315 // and keep looking. A load-pair instruction with both destination 1316 // registers the same is UNPREDICTABLE and will result in an exception. 1317 if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { 1318 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, 1319 TRI); 1320 MemInsns.push_back(&MI); 1321 continue; 1322 } 1323 1324 // If the Rt of the second instruction was not modified or used between 1325 // the two instructions and none of the instructions between the second 1326 // and first alias with the second, we can combine the second into the 1327 // first. 1328 if (ModifiedRegUnits.available(getLdStRegOp(MI).getReg()) && 1329 !(MI.mayLoad() && 1330 !UsedRegUnits.available(getLdStRegOp(MI).getReg())) && 1331 !mayAlias(MI, MemInsns, AA)) { 1332 Flags.setMergeForward(false); 1333 return MBBI; 1334 } 1335 1336 // Likewise, if the Rt of the first instruction is not modified or used 1337 // between the two instructions and none of the instructions between the 1338 // first and the second alias with the first, we can combine the first 1339 // into the second. 1340 if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) && 1341 !(MayLoad && 1342 !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) && 1343 !mayAlias(FirstMI, MemInsns, AA)) { 1344 Flags.setMergeForward(true); 1345 return MBBI; 1346 } 1347 // Unable to combine these instructions due to interference in between. 1348 // Keep looking. 1349 } 1350 } 1351 1352 // The instruction wasn't a matching load or store. Stop searching if we 1353 // encounter a call instruction that might modify memory. 1354 if (MI.isCall()) 1355 return E; 1356 1357 // Update modified / uses register units. 1358 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 1359 1360 // Otherwise, if the base register is modified, we have no match, so 1361 // return early. 1362 if (!ModifiedRegUnits.available(BaseReg)) 1363 return E; 1364 1365 // Update list of instructions that read/write memory.
1366 if (MI.mayLoadOrStore()) 1367 MemInsns.push_back(&MI); 1368 } 1369 return E; 1370 } 1371 1372 MachineBasicBlock::iterator 1373 AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, 1374 MachineBasicBlock::iterator Update, 1375 bool IsPreIdx) { 1376 assert((Update->getOpcode() == AArch64::ADDXri || 1377 Update->getOpcode() == AArch64::SUBXri) && 1378 "Unexpected base register update instruction to merge!"); 1379 MachineBasicBlock::iterator NextI = I; 1380 // Return the instruction following the merged instruction, which is 1381 // the instruction following our unmerged load. Unless that's the add/sub 1382 // instruction we're merging, in which case it's the one after that. 1383 if (++NextI == Update) 1384 ++NextI; 1385 1386 int Value = Update->getOperand(2).getImm(); 1387 assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && 1388 "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); 1389 if (Update->getOpcode() == AArch64::SUBXri) 1390 Value = -Value; 1391 1392 unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) 1393 : getPostIndexedOpcode(I->getOpcode()); 1394 MachineInstrBuilder MIB; 1395 int Scale, MinOffset, MaxOffset; 1396 getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset); 1397 if (!isPairedLdSt(*I)) { 1398 // Non-paired instruction. 1399 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 1400 .add(getLdStRegOp(*Update)) 1401 .add(getLdStRegOp(*I)) 1402 .add(getLdStBaseOp(*I)) 1403 .addImm(Value / Scale) 1404 .setMemRefs(I->memoperands()) 1405 .setMIFlags(I->mergeFlagsWith(*Update)); 1406 } else { 1407 // Paired instruction. 1408 MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) 1409 .add(getLdStRegOp(*Update)) 1410 .add(getLdStRegOp(*I, 0)) 1411 .add(getLdStRegOp(*I, 1)) 1412 .add(getLdStBaseOp(*I)) 1413 .addImm(Value / Scale) 1414 .setMemRefs(I->memoperands()) 1415 .setMIFlags(I->mergeFlagsWith(*Update)); 1416 } 1417 (void)MIB; 1418 1419 if (IsPreIdx) { 1420 ++NumPreFolded; 1421 LLVM_DEBUG(dbgs() << "Creating pre-indexed load/store."); 1422 } else { 1423 ++NumPostFolded; 1424 LLVM_DEBUG(dbgs() << "Creating post-indexed load/store."); 1425 } 1426 LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); 1427 LLVM_DEBUG(I->print(dbgs())); 1428 LLVM_DEBUG(dbgs() << " "); 1429 LLVM_DEBUG(Update->print(dbgs())); 1430 LLVM_DEBUG(dbgs() << " with instruction:\n "); 1431 LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs())); 1432 LLVM_DEBUG(dbgs() << "\n"); 1433 1434 // Erase the old instructions for the block. 1435 I->eraseFromParent(); 1436 Update->eraseFromParent(); 1437 1438 return NextI; 1439 } 1440 1441 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, 1442 MachineInstr &MI, 1443 unsigned BaseReg, int Offset) { 1444 switch (MI.getOpcode()) { 1445 default: 1446 break; 1447 case AArch64::SUBXri: 1448 case AArch64::ADDXri: 1449 // Make sure it's a vanilla immediate operand, not a relocation or 1450 // anything else we can't handle. 1451 if (!MI.getOperand(2).isImm()) 1452 break; 1453 // Watch out for 1 << 12 shifted value. 1454 if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) 1455 break; 1456 1457 // The update instruction source and destination register must be the 1458 // same as the load/store base register. 
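    // For example, for
    //   ldr x0, [x20]
    // only an update of the form
    //   add x20, x20, #imm   (or sub x20, x20, #imm)
    // can be folded into a pre-/post-indexed access.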
1459 if (MI.getOperand(0).getReg() != BaseReg || 1460 MI.getOperand(1).getReg() != BaseReg) 1461 break; 1462 1463 int UpdateOffset = MI.getOperand(2).getImm(); 1464 if (MI.getOpcode() == AArch64::SUBXri) 1465 UpdateOffset = -UpdateOffset; 1466 1467 // The immediate must be a multiple of the scaling factor of the pre/post 1468 // indexed instruction. 1469 int Scale, MinOffset, MaxOffset; 1470 getPrePostIndexedMemOpInfo(MemMI, Scale, MinOffset, MaxOffset); 1471 if (UpdateOffset % Scale != 0) 1472 break; 1473 1474 // Scaled offset must fit in the instruction immediate. 1475 int ScaledOffset = UpdateOffset / Scale; 1476 if (ScaledOffset > MaxOffset || ScaledOffset < MinOffset) 1477 break; 1478 1479 // If we have a non-zero Offset, we check that it matches the amount 1480 // we're adding to the register. 1481 if (!Offset || Offset == UpdateOffset) 1482 return true; 1483 break; 1484 } 1485 return false; 1486 } 1487 1488 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( 1489 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { 1490 MachineBasicBlock::iterator E = I->getParent()->end(); 1491 MachineInstr &MemMI = *I; 1492 MachineBasicBlock::iterator MBBI = I; 1493 1494 Register BaseReg = getLdStBaseOp(MemMI).getReg(); 1495 int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); 1496 1497 // Scan forward looking for post-index opportunities. Updating instructions 1498 // can't be formed if the memory instruction doesn't have the offset we're 1499 // looking for. 1500 if (MIUnscaledOffset != UnscaledOffset) 1501 return E; 1502 1503 // If the base register overlaps a source/destination register, we can't 1504 // merge the update. This does not apply to tag store instructions which 1505 // ignore the address part of the source register. 1506 // This does not apply to STGPi as well, which does not have unpredictable 1507 // behavior in this case unlike normal stores, and always performs writeback 1508 // after reading the source register value. 1509 if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) { 1510 bool IsPairedInsn = isPairedLdSt(MemMI); 1511 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { 1512 Register DestReg = getLdStRegOp(MemMI, i).getReg(); 1513 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 1514 return E; 1515 } 1516 } 1517 1518 // Track which register units have been modified and used between the first 1519 // insn (inclusive) and the second insn. 1520 ModifiedRegUnits.clear(); 1521 UsedRegUnits.clear(); 1522 ++MBBI; 1523 for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { 1524 MachineInstr &MI = *MBBI; 1525 1526 // Don't count transient instructions towards the search limit since there 1527 // may be different numbers of them if e.g. debug information is present. 1528 if (!MI.isTransient()) 1529 ++Count; 1530 1531 // If we found a match, return it. 1532 if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) 1533 return MBBI; 1534 1535 // Update the status of what the instruction clobbered and used. 1536 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 1537 1538 // Otherwise, if the base register is used or modified, we have no match, so 1539 // return early. 
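    // (A plain use is enough to stop the search: folding the update into the
    // memory instruction would change the base register value observed by that
    // intervening use.)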
1540 if (!ModifiedRegUnits.available(BaseReg) || 1541 !UsedRegUnits.available(BaseReg)) 1542 return E; 1543 } 1544 return E; 1545 } 1546 1547 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( 1548 MachineBasicBlock::iterator I, unsigned Limit) { 1549 MachineBasicBlock::iterator B = I->getParent()->begin(); 1550 MachineBasicBlock::iterator E = I->getParent()->end(); 1551 MachineInstr &MemMI = *I; 1552 MachineBasicBlock::iterator MBBI = I; 1553 1554 Register BaseReg = getLdStBaseOp(MemMI).getReg(); 1555 int Offset = getLdStOffsetOp(MemMI).getImm(); 1556 1557 // If the load/store is the first instruction in the block, there's obviously 1558 // not any matching update. Ditto if the memory offset isn't zero. 1559 if (MBBI == B || Offset != 0) 1560 return E; 1561 // If the base register overlaps a destination register, we can't 1562 // merge the update. 1563 if (!isTagStore(MemMI)) { 1564 bool IsPairedInsn = isPairedLdSt(MemMI); 1565 for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { 1566 Register DestReg = getLdStRegOp(MemMI, i).getReg(); 1567 if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) 1568 return E; 1569 } 1570 } 1571 1572 // Track which register units have been modified and used between the first 1573 // insn (inclusive) and the second insn. 1574 ModifiedRegUnits.clear(); 1575 UsedRegUnits.clear(); 1576 unsigned Count = 0; 1577 do { 1578 --MBBI; 1579 MachineInstr &MI = *MBBI; 1580 1581 // Don't count transient instructions towards the search limit since there 1582 // may be different numbers of them if e.g. debug information is present. 1583 if (!MI.isTransient()) 1584 ++Count; 1585 1586 // If we found a match, return it. 1587 if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) 1588 return MBBI; 1589 1590 // Update the status of what the instruction clobbered and used. 1591 LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); 1592 1593 // Otherwise, if the base register is used or modified, we have no match, so 1594 // return early. 1595 if (!ModifiedRegUnits.available(BaseReg) || 1596 !UsedRegUnits.available(BaseReg)) 1597 return E; 1598 } while (MBBI != B && Count < Limit); 1599 return E; 1600 } 1601 1602 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( 1603 MachineBasicBlock::iterator &MBBI) { 1604 MachineInstr &MI = *MBBI; 1605 // If this is a volatile load, don't mess with it. 1606 if (MI.hasOrderedMemoryRef()) 1607 return false; 1608 1609 // Make sure this is a reg+imm. 1610 // FIXME: It is possible to extend it to handle reg+reg cases. 1611 if (!getLdStOffsetOp(MI).isImm()) 1612 return false; 1613 1614 // Look backward up to LdStLimit instructions. 1615 MachineBasicBlock::iterator StoreI; 1616 if (findMatchingStore(MBBI, LdStLimit, StoreI)) { 1617 ++NumLoadsFromStoresPromoted; 1618 // Promote the load. Keeping the iterator straight is a 1619 // pain, so we let the merge routine tell us what the next instruction 1620 // is after it's done mucking about. 1621 MBBI = promoteLoadFromStore(MBBI, StoreI); 1622 return true; 1623 } 1624 return false; 1625 } 1626 1627 // Merge adjacent zero stores into a wider store. 
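// For example,
//   strh wzr, [x0]
//   strh wzr, [x0, #2]
// is rewritten as
//   str wzr, [x0]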
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  // If this is a volatile load or another ordered memory access, don't mess
  // with it.
  if (MI.hasOrderedMemoryRef())
    return false;

  // Make sure this is a reg+imm.
  // FIXME: It is possible to extend it to handle reg+reg cases.
  if (!getLdStOffsetOp(MI).isImm())
    return false;

  // Look backward up to LdStLimit instructions.
  MachineBasicBlock::iterator StoreI;
  if (findMatchingStore(MBBI, LdStLimit, StoreI)) {
    ++NumLoadsFromStoresPromoted;
    // Promote the load. Keeping the iterator straight is a pain, so we let the
    // merge routine tell us what the next instruction is after it's done
    // mucking about.
    MBBI = promoteLoadFromStore(MBBI, StoreI);
    return true;
  }
  return false;
}

// Merge adjacent zero stores into a wider store.
bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
    MachineBasicBlock::iterator &MBBI) {
  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Look ahead up to LdStLimit instructions for a mergeable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator MergeMI =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
  if (MergeMI != E) {
    ++NumZeroStoresPromoted;

    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
    return true;
  }
  return false;
}
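
// Illustrative note for the pairing below (mirroring the summary in
// optimizeBlock): two adjacent accesses off the same base register are fused
// into a single paired access, e.g.
//   ldr x0, [x2]
//   ldr x1, [x2, #8]
// becomes
//   ldp x0, x1, [x2]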
// Find loads and stores that can be merged into a single load or store pair
// instruction.
bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();

  if (!TII->isCandidateToMergeOrPair(MI))
    return false;

  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1)
  bool IsUnscaled = TII->isUnscaledLdSt(MI);
  int Offset = getLdStOffsetOp(MI).getImm();
  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
  // Allow one more for offset.
  if (Offset > 0)
    Offset -= OffsetStride;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return false;

  // Look ahead up to LdStLimit instructions for a pairable instruction.
  LdStPairFlags Flags;
  MachineBasicBlock::iterator Paired =
      findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false);
  if (Paired != E) {
    ++NumPairCreated;
    if (TII->isUnscaledLdSt(MI))
      ++NumUnscaledPairCreated;
    // Keeping the iterator straight is a pain, so we let the merge routine
    // tell us what the next instruction is after it's done mucking about.
    MBBI = mergePairedInsns(MBBI, Paired, Flags);
    return true;
  }
  return false;
}

bool AArch64LoadStoreOpt::tryToMergeLdStUpdate(
    MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  MachineBasicBlock::iterator E = MI.getParent()->end();
  MachineBasicBlock::iterator Update;

  // Look forward to try to form a post-index instruction. For example,
  // ldr x0, [x20]
  // add x20, x20, #32
  // merged into:
  // ldr x0, [x20], #32
  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
    return true;
  }

  // Don't know how to handle unscaled pre/post-index versions below, so bail.
  if (TII->isUnscaledLdSt(MI.getOpcode()))
    return false;

  // Look back to try to find a pre-index instruction. For example,
  // add x0, x0, #8
  // ldr x1, [x0]
  // merged into:
  // ldr x1, [x0, #8]!
  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  // The immediate in the load/store is scaled by the size of the memory
  // operation. The immediate in the add we're looking for, however, is not,
  // so adjust here.
  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);

  // Look forward to try to find a pre-index instruction. For example,
  // ldr x1, [x0, #64]
  // add x0, x0, #64
  // merged into:
  // ldr x1, [x0, #64]!
  Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
  if (Update != E) {
    // Merge the update into the ld/st.
    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
    return true;
  }

  return false;
}
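
// Note: the transformations below are applied one at a time over the whole
// block, in the order they are listed, so their results compose. For example
// (illustrative), a pair formed in step 3,
//   ldp x0, x1, [x2]
//   add x2, x2, #16
// can still have the base-register update folded in step 4, giving
//   ldp x0, x1, [x2], #16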
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                        bool EnableNarrowZeroStOpt) {
  bool Modified = false;
  // Four transformations to do here:
  // 1) Find loads that directly read from stores and promote them by
  //    replacing with mov instructions. If the store is wider than the load,
  //    the load will be replaced with a bitfield extract.
  //      e.g.,
  //        str w1, [x0, #4]
  //        ldrh w2, [x0, #6]
  //        ; becomes
  //        str w1, [x0, #4]
  //        lsr w2, w1, #16
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isPromotableLoadFromStore(*MBBI) && tryToPromoteLoadFromStore(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 2) Merge adjacent zero stores into a wider store.
  //      e.g.,
  //        strh wzr, [x0]
  //        strh wzr, [x0, #2]
  //        ; becomes
  //        str wzr, [x0]
  //      e.g.,
  //        str wzr, [x0]
  //        str wzr, [x0, #4]
  //        ; becomes
  //        str xzr, [x0]
  if (EnableNarrowZeroStOpt)
    for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
         MBBI != E;) {
      if (isPromotableZeroStoreInst(*MBBI) && tryToMergeZeroStInst(MBBI))
        Modified = true;
      else
        ++MBBI;
    }
  // 3) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
      Modified = true;
    else
      ++MBBI;
  }
  // 4) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    if (isMergeableLdStUpdate(*MBBI) && tryToMergeLdStUpdate(MBBI))
      Modified = true;
    else
      ++MBBI;
  }

  return Modified;
}

bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
  if (skipFunction(Fn.getFunction()))
    return false;

  Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
  TRI = Subtarget->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  // Resize the modified and used register unit trackers. We do this once per
  // function and then clear the register units each time we optimize a load
  // or store.
  ModifiedRegUnits.init(*TRI);
  UsedRegUnits.init(*TRI);

  bool Modified = false;
  bool EnableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
  for (auto &MBB : Fn)
    Modified |= optimizeBlock(MBB, EnableNarrowZeroStOpt);

  return Modified;
}

// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
// stores near one another? Note: The pre-RA instruction scheduler already has
// hooks to try and schedule pairable loads/stores together to improve pairing
// opportunities. Thus, a pre-RA pairing pass may not be worth the effort.

// FIXME: When pairing store instructions it's very possible for this pass to
// hoist a store with a KILL marker above another use (without a KILL marker).
// The resulting IR is invalid, but nothing uses the KILL markers after this
// pass, so it's never caused a problem in practice.

/// createAArch64LoadStoreOptimizationPass - returns an instance of the
/// load / store optimization pass.
FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
  return new AArch64LoadStoreOpt();
}