1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass tries to fuse DS instructions with close by immediate offsets. 10 // This will fuse operations such as 11 // ds_read_b32 v0, v2 offset:16 12 // ds_read_b32 v1, v2 offset:32 13 // ==> 14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 15 // 16 // The same is done for certain SMEM and VMEM opcodes, e.g.: 17 // s_buffer_load_dword s4, s[0:3], 4 18 // s_buffer_load_dword s5, s[0:3], 8 19 // ==> 20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 21 // 22 // This pass also tries to promote constant offset to the immediate by 23 // adjusting the base. It tries to use a base from the nearby instructions that 24 // allows it to have a 13bit constant offset and then promotes the 13bit offset 25 // to the immediate. 26 // E.g. 27 // s_movk_i32 s0, 0x1800 28 // v_add_co_u32_e32 v0, vcc, s0, v2 29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc 30 // 31 // s_movk_i32 s0, 0x1000 32 // v_add_co_u32_e32 v5, vcc, s0, v2 33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 34 // global_load_dwordx2 v[5:6], v[5:6], off 35 // global_load_dwordx2 v[0:1], v[0:1], off 36 // => 37 // s_movk_i32 s0, 0x1000 38 // v_add_co_u32_e32 v5, vcc, s0, v2 39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 40 // global_load_dwordx2 v[5:6], v[5:6], off 41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 42 // 43 // Future improvements: 44 // 45 // - This is currently missing stores of constants because loading 46 // the constant into the data register is placed between the stores, although 47 // this is arguably a scheduling problem. 48 // 49 // - Live interval recomputing seems inefficient. This currently only matches 50 // one pair, and recomputes live intervals and moves on to the next pair. It 51 // would be better to compute a list of all merges that need to occur. 52 // 53 // - With a list of instructions to process, we can also merge more. If a 54 // cluster of loads have offsets that are too large to fit in the 8-bit 55 // offsets, but are close enough to fit in the 8 bits, we can add to the base 56 // pointer and use the new reduced offsets. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #include "AMDGPU.h" 61 #include "GCNSubtarget.h" 62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 63 #include "llvm/Analysis/AliasAnalysis.h" 64 #include "llvm/CodeGen/MachineFunctionPass.h" 65 #include "llvm/InitializePasses.h" 66 67 using namespace llvm; 68 69 #define DEBUG_TYPE "si-load-store-opt" 70 71 namespace { 72 enum InstClassEnum { 73 UNKNOWN, 74 DS_READ, 75 DS_WRITE, 76 S_BUFFER_LOAD_IMM, 77 S_BUFFER_LOAD_SGPR_IMM, 78 S_LOAD_IMM, 79 BUFFER_LOAD, 80 BUFFER_STORE, 81 MIMG, 82 TBUFFER_LOAD, 83 TBUFFER_STORE, 84 GLOBAL_LOAD_SADDR, 85 GLOBAL_STORE_SADDR, 86 FLAT_LOAD, 87 FLAT_STORE, 88 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of 89 GLOBAL_STORE // any CombineInfo, they are only ever returned by 90 // getCommonInstClass. 
91 }; 92 93 struct AddressRegs { 94 unsigned char NumVAddrs = 0; 95 bool SBase = false; 96 bool SRsrc = false; 97 bool SOffset = false; 98 bool SAddr = false; 99 bool VAddr = false; 100 bool Addr = false; 101 bool SSamp = false; 102 }; 103 104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 105 const unsigned MaxAddressRegs = 12 + 1 + 1; 106 107 class SILoadStoreOptimizer : public MachineFunctionPass { 108 struct CombineInfo { 109 MachineBasicBlock::iterator I; 110 unsigned EltSize; 111 unsigned Offset; 112 unsigned Width; 113 unsigned Format; 114 unsigned BaseOff; 115 unsigned DMask; 116 InstClassEnum InstClass; 117 unsigned CPol = 0; 118 bool IsAGPR; 119 bool UseST64; 120 int AddrIdx[MaxAddressRegs]; 121 const MachineOperand *AddrReg[MaxAddressRegs]; 122 unsigned NumAddresses; 123 unsigned Order; 124 125 bool hasSameBaseAddress(const CombineInfo &CI) { 126 if (NumAddresses != CI.NumAddresses) 127 return false; 128 129 const MachineInstr &MI = *CI.I; 130 for (unsigned i = 0; i < NumAddresses; i++) { 131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 132 133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 135 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 136 return false; 137 } 138 continue; 139 } 140 141 // Check same base pointer. Be careful of subregisters, which can occur 142 // with vectors of pointers. 143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 145 return false; 146 } 147 } 148 return true; 149 } 150 151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 152 for (unsigned i = 0; i < NumAddresses; ++i) { 153 const MachineOperand *AddrOp = AddrReg[i]; 154 // Immediates are always OK. 155 if (AddrOp->isImm()) 156 continue; 157 158 // Don't try to merge addresses that aren't either immediates or registers. 159 // TODO: Should be possible to merge FrameIndexes and maybe some other 160 // non-register 161 if (!AddrOp->isReg()) 162 return false; 163 164 // TODO: We should be able to merge instructions with other physical reg 165 // addresses too. 166 if (AddrOp->getReg().isPhysical() && 167 AddrOp->getReg() != AMDGPU::SGPR_NULL) 168 return false; 169 170 // If an address has only one use then there will be no other 171 // instructions with the same address, so we can't merge this one. 172 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 173 return false; 174 } 175 return true; 176 } 177 178 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 179 180 // Compare by pointer order. 181 bool operator<(const CombineInfo& Other) const { 182 return (InstClass == MIMG) ? 
DMask < Other.DMask : Offset < Other.Offset; 183 } 184 }; 185 186 struct BaseRegisters { 187 Register LoReg; 188 Register HiReg; 189 190 unsigned LoSubReg = 0; 191 unsigned HiSubReg = 0; 192 }; 193 194 struct MemAddress { 195 BaseRegisters Base; 196 int64_t Offset = 0; 197 }; 198 199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 200 201 private: 202 const GCNSubtarget *STM = nullptr; 203 const SIInstrInfo *TII = nullptr; 204 const SIRegisterInfo *TRI = nullptr; 205 MachineRegisterInfo *MRI = nullptr; 206 AliasAnalysis *AA = nullptr; 207 bool OptimizeAgain; 208 209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 210 const DenseSet<Register> &ARegUses, 211 const MachineInstr &A, const MachineInstr &B) const; 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass * 223 getTargetRegisterClass(const CombineInfo &CI, 224 const CombineInfo &Paired) const; 225 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 226 227 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 228 229 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired, 230 MachineBasicBlock::iterator InsertBefore, int OpName, 231 Register DestReg) const; 232 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, 233 MachineBasicBlock::iterator InsertBefore, 234 int OpName) const; 235 236 unsigned read2Opcode(unsigned EltSize) const; 237 unsigned read2ST64Opcode(unsigned EltSize) const; 238 MachineBasicBlock::iterator 239 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 240 MachineBasicBlock::iterator InsertBefore); 241 242 unsigned write2Opcode(unsigned EltSize) const; 243 unsigned write2ST64Opcode(unsigned EltSize) const; 244 MachineBasicBlock::iterator 245 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 246 MachineBasicBlock::iterator InsertBefore); 247 MachineBasicBlock::iterator 248 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 249 MachineBasicBlock::iterator InsertBefore); 250 MachineBasicBlock::iterator 251 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 252 MachineBasicBlock::iterator InsertBefore); 253 MachineBasicBlock::iterator 254 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 255 MachineBasicBlock::iterator InsertBefore); 256 MachineBasicBlock::iterator 257 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 258 MachineBasicBlock::iterator InsertBefore); 259 MachineBasicBlock::iterator 260 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 261 MachineBasicBlock::iterator InsertBefore); 262 MachineBasicBlock::iterator 263 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 264 MachineBasicBlock::iterator InsertBefore); 265 MachineBasicBlock::iterator 266 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 267 MachineBasicBlock::iterator InsertBefore); 268 MachineBasicBlock::iterator 269 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 270 MachineBasicBlock::iterator InsertBefore); 271 272 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 273 int32_t NewOffset) 
const; 274 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 275 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 276 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 277 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 278 /// Promotes constant offset to the immediate by adjusting the base. It 279 /// tries to use a base from the nearby instructions that allows it to have 280 /// a 13bit constant offset which gets promoted to the immediate. 281 bool promoteConstantOffsetToImm(MachineInstr &CI, 282 MemInfoMap &Visited, 283 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 284 void addInstToMergeableList(const CombineInfo &CI, 285 std::list<std::list<CombineInfo> > &MergeableInsts) const; 286 287 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 288 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 289 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 290 std::list<std::list<CombineInfo>> &MergeableInsts) const; 291 292 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 293 const CombineInfo &Paired); 294 295 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 296 const CombineInfo &Paired); 297 298 public: 299 static char ID; 300 301 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 302 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 303 } 304 305 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 306 bool &OptimizeListAgain); 307 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 308 309 bool runOnMachineFunction(MachineFunction &MF) override; 310 311 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 312 313 void getAnalysisUsage(AnalysisUsage &AU) const override { 314 AU.setPreservesCFG(); 315 AU.addRequired<AAResultsWrapperPass>(); 316 317 MachineFunctionPass::getAnalysisUsage(AU); 318 } 319 320 MachineFunctionProperties getRequiredProperties() const override { 321 return MachineFunctionProperties() 322 .set(MachineFunctionProperties::Property::IsSSA); 323 } 324 }; 325 326 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 327 const unsigned Opc = MI.getOpcode(); 328 329 if (TII.isMUBUF(Opc)) { 330 // FIXME: Handle d16 correctly 331 return AMDGPU::getMUBUFElements(Opc); 332 } 333 if (TII.isImage(MI)) { 334 uint64_t DMaskImm = 335 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 336 return llvm::popcount(DMaskImm); 337 } 338 if (TII.isMTBUF(Opc)) { 339 return AMDGPU::getMTBUFElements(Opc); 340 } 341 342 switch (Opc) { 343 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 344 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 345 case AMDGPU::S_LOAD_DWORD_IMM: 346 case AMDGPU::GLOBAL_LOAD_DWORD: 347 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 348 case AMDGPU::GLOBAL_STORE_DWORD: 349 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 350 case AMDGPU::FLAT_LOAD_DWORD: 351 case AMDGPU::FLAT_STORE_DWORD: 352 return 1; 353 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 354 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 355 case AMDGPU::S_LOAD_DWORDX2_IMM: 356 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 357 case AMDGPU::GLOBAL_LOAD_DWORDX2: 358 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 359 case AMDGPU::GLOBAL_STORE_DWORDX2: 360 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 361 case AMDGPU::FLAT_LOAD_DWORDX2: 362 case AMDGPU::FLAT_STORE_DWORDX2: 363 return 2; 364 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 365 case 
AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 366 case AMDGPU::S_LOAD_DWORDX3_IMM: 367 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 368 case AMDGPU::GLOBAL_LOAD_DWORDX3: 369 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 370 case AMDGPU::GLOBAL_STORE_DWORDX3: 371 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 372 case AMDGPU::FLAT_LOAD_DWORDX3: 373 case AMDGPU::FLAT_STORE_DWORDX3: 374 return 3; 375 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 377 case AMDGPU::S_LOAD_DWORDX4_IMM: 378 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 379 case AMDGPU::GLOBAL_LOAD_DWORDX4: 380 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 381 case AMDGPU::GLOBAL_STORE_DWORDX4: 382 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 383 case AMDGPU::FLAT_LOAD_DWORDX4: 384 case AMDGPU::FLAT_STORE_DWORDX4: 385 return 4; 386 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 387 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 388 case AMDGPU::S_LOAD_DWORDX8_IMM: 389 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 390 return 8; 391 case AMDGPU::DS_READ_B32: 392 case AMDGPU::DS_READ_B32_gfx9: 393 case AMDGPU::DS_WRITE_B32: 394 case AMDGPU::DS_WRITE_B32_gfx9: 395 return 1; 396 case AMDGPU::DS_READ_B64: 397 case AMDGPU::DS_READ_B64_gfx9: 398 case AMDGPU::DS_WRITE_B64: 399 case AMDGPU::DS_WRITE_B64_gfx9: 400 return 2; 401 default: 402 return 0; 403 } 404 } 405 406 /// Maps instruction opcode to enum InstClassEnum. 407 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 408 switch (Opc) { 409 default: 410 if (TII.isMUBUF(Opc)) { 411 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 412 default: 413 return UNKNOWN; 414 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN: 415 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact: 416 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN: 417 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact: 418 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 419 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 420 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 421 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 422 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN: 423 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact: 424 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN: 425 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact: 426 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: 427 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: 428 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: 429 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: 430 return BUFFER_LOAD; 431 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN: 432 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact: 433 case AMDGPU::BUFFER_STORE_DWORD_IDXEN: 434 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact: 435 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 436 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 437 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 438 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 439 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN: 440 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact: 441 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN: 442 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact: 443 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: 444 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: 445 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: 446 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: 447 return BUFFER_STORE; 448 } 449 } 450 if (TII.isImage(Opc)) { 451 // Ignore instructions encoded without vaddr. 
452 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 453 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 454 return UNKNOWN; 455 // Ignore BVH instructions 456 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 457 return UNKNOWN; 458 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 459 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 460 TII.isGather4(Opc)) 461 return UNKNOWN; 462 return MIMG; 463 } 464 if (TII.isMTBUF(Opc)) { 465 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 466 default: 467 return UNKNOWN; 468 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 469 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 470 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 471 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 472 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 473 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 474 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 475 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 476 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: 477 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: 478 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: 479 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: 480 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: 481 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: 482 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: 483 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: 484 return TBUFFER_LOAD; 485 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 486 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 487 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 488 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 489 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: 490 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: 491 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: 492 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: 493 return TBUFFER_STORE; 494 } 495 } 496 return UNKNOWN; 497 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 498 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 499 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 500 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 501 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 502 return S_BUFFER_LOAD_IMM; 503 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 504 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 505 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 506 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 507 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 508 return S_BUFFER_LOAD_SGPR_IMM; 509 case AMDGPU::S_LOAD_DWORD_IMM: 510 case AMDGPU::S_LOAD_DWORDX2_IMM: 511 case AMDGPU::S_LOAD_DWORDX3_IMM: 512 case AMDGPU::S_LOAD_DWORDX4_IMM: 513 case AMDGPU::S_LOAD_DWORDX8_IMM: 514 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 515 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 516 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 517 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 518 return S_LOAD_IMM; 519 case AMDGPU::DS_READ_B32: 520 case AMDGPU::DS_READ_B32_gfx9: 521 case AMDGPU::DS_READ_B64: 522 case AMDGPU::DS_READ_B64_gfx9: 523 return DS_READ; 524 case AMDGPU::DS_WRITE_B32: 525 case AMDGPU::DS_WRITE_B32_gfx9: 526 case AMDGPU::DS_WRITE_B64: 527 case AMDGPU::DS_WRITE_B64_gfx9: 528 return DS_WRITE; 529 case AMDGPU::GLOBAL_LOAD_DWORD: 530 case AMDGPU::GLOBAL_LOAD_DWORDX2: 531 case AMDGPU::GLOBAL_LOAD_DWORDX3: 532 case AMDGPU::GLOBAL_LOAD_DWORDX4: 533 case AMDGPU::FLAT_LOAD_DWORD: 534 case AMDGPU::FLAT_LOAD_DWORDX2: 535 case AMDGPU::FLAT_LOAD_DWORDX3: 536 case AMDGPU::FLAT_LOAD_DWORDX4: 537 return FLAT_LOAD; 538 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 539 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 540 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 541 
case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 542 return GLOBAL_LOAD_SADDR; 543 case AMDGPU::GLOBAL_STORE_DWORD: 544 case AMDGPU::GLOBAL_STORE_DWORDX2: 545 case AMDGPU::GLOBAL_STORE_DWORDX3: 546 case AMDGPU::GLOBAL_STORE_DWORDX4: 547 case AMDGPU::FLAT_STORE_DWORD: 548 case AMDGPU::FLAT_STORE_DWORDX2: 549 case AMDGPU::FLAT_STORE_DWORDX3: 550 case AMDGPU::FLAT_STORE_DWORDX4: 551 return FLAT_STORE; 552 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 553 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 554 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 555 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 556 return GLOBAL_STORE_SADDR; 557 } 558 } 559 560 /// Determines instruction subclass from opcode. Only instructions 561 /// of the same subclass can be merged together. The merged instruction may have 562 /// a different subclass but must have the same class. 563 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 564 switch (Opc) { 565 default: 566 if (TII.isMUBUF(Opc)) 567 return AMDGPU::getMUBUFBaseOpcode(Opc); 568 if (TII.isImage(Opc)) { 569 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 570 assert(Info); 571 return Info->BaseOpcode; 572 } 573 if (TII.isMTBUF(Opc)) 574 return AMDGPU::getMTBUFBaseOpcode(Opc); 575 return -1; 576 case AMDGPU::DS_READ_B32: 577 case AMDGPU::DS_READ_B32_gfx9: 578 case AMDGPU::DS_READ_B64: 579 case AMDGPU::DS_READ_B64_gfx9: 580 case AMDGPU::DS_WRITE_B32: 581 case AMDGPU::DS_WRITE_B32_gfx9: 582 case AMDGPU::DS_WRITE_B64: 583 case AMDGPU::DS_WRITE_B64_gfx9: 584 return Opc; 585 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 586 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 587 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 588 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 589 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 590 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 591 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 592 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 593 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 594 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 595 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 596 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; 597 case AMDGPU::S_LOAD_DWORD_IMM: 598 case AMDGPU::S_LOAD_DWORDX2_IMM: 599 case AMDGPU::S_LOAD_DWORDX3_IMM: 600 case AMDGPU::S_LOAD_DWORDX4_IMM: 601 case AMDGPU::S_LOAD_DWORDX8_IMM: 602 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 603 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 604 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 605 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 606 return AMDGPU::S_LOAD_DWORD_IMM; 607 case AMDGPU::GLOBAL_LOAD_DWORD: 608 case AMDGPU::GLOBAL_LOAD_DWORDX2: 609 case AMDGPU::GLOBAL_LOAD_DWORDX3: 610 case AMDGPU::GLOBAL_LOAD_DWORDX4: 611 case AMDGPU::FLAT_LOAD_DWORD: 612 case AMDGPU::FLAT_LOAD_DWORDX2: 613 case AMDGPU::FLAT_LOAD_DWORDX3: 614 case AMDGPU::FLAT_LOAD_DWORDX4: 615 return AMDGPU::FLAT_LOAD_DWORD; 616 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 617 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 618 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 619 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 620 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 621 case AMDGPU::GLOBAL_STORE_DWORD: 622 case AMDGPU::GLOBAL_STORE_DWORDX2: 623 case AMDGPU::GLOBAL_STORE_DWORDX3: 624 case AMDGPU::GLOBAL_STORE_DWORDX4: 625 case AMDGPU::FLAT_STORE_DWORD: 626 case AMDGPU::FLAT_STORE_DWORDX2: 627 case AMDGPU::FLAT_STORE_DWORDX3: 628 case AMDGPU::FLAT_STORE_DWORDX4: 629 return AMDGPU::FLAT_STORE_DWORD; 630 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 631 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 632 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 633 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 634 return 
AMDGPU::GLOBAL_STORE_DWORD_SADDR; 635 } 636 } 637 638 // GLOBAL loads and stores are classified as FLAT initially. If both combined 639 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. 640 // If either or both instructions are non segment specific FLAT the resulting 641 // combined operation will be FLAT, potentially promoting one of the GLOBAL 642 // operations to FLAT. 643 // For other instructions return the original unmodified class. 644 InstClassEnum 645 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, 646 const CombineInfo &Paired) { 647 assert(CI.InstClass == Paired.InstClass); 648 649 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && 650 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) 651 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; 652 653 return CI.InstClass; 654 } 655 656 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 657 AddressRegs Result; 658 659 if (TII.isMUBUF(Opc)) { 660 if (AMDGPU::getMUBUFHasVAddr(Opc)) 661 Result.VAddr = true; 662 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 663 Result.SRsrc = true; 664 if (AMDGPU::getMUBUFHasSoffset(Opc)) 665 Result.SOffset = true; 666 667 return Result; 668 } 669 670 if (TII.isImage(Opc)) { 671 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 672 if (VAddr0Idx >= 0) { 673 int RsrcName = 674 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; 675 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); 676 Result.NumVAddrs = RsrcIdx - VAddr0Idx; 677 } else { 678 Result.VAddr = true; 679 } 680 Result.SRsrc = true; 681 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 682 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 683 Result.SSamp = true; 684 685 return Result; 686 } 687 if (TII.isMTBUF(Opc)) { 688 if (AMDGPU::getMTBUFHasVAddr(Opc)) 689 Result.VAddr = true; 690 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 691 Result.SRsrc = true; 692 if (AMDGPU::getMTBUFHasSoffset(Opc)) 693 Result.SOffset = true; 694 695 return Result; 696 } 697 698 switch (Opc) { 699 default: 700 return Result; 701 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 702 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 703 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 704 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 705 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 706 Result.SOffset = true; 707 [[fallthrough]]; 708 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 709 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 710 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 711 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 712 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 713 case AMDGPU::S_LOAD_DWORD_IMM: 714 case AMDGPU::S_LOAD_DWORDX2_IMM: 715 case AMDGPU::S_LOAD_DWORDX3_IMM: 716 case AMDGPU::S_LOAD_DWORDX4_IMM: 717 case AMDGPU::S_LOAD_DWORDX8_IMM: 718 case AMDGPU::S_LOAD_DWORDX2_IMM_ec: 719 case AMDGPU::S_LOAD_DWORDX3_IMM_ec: 720 case AMDGPU::S_LOAD_DWORDX4_IMM_ec: 721 case AMDGPU::S_LOAD_DWORDX8_IMM_ec: 722 Result.SBase = true; 723 return Result; 724 case AMDGPU::DS_READ_B32: 725 case AMDGPU::DS_READ_B64: 726 case AMDGPU::DS_READ_B32_gfx9: 727 case AMDGPU::DS_READ_B64_gfx9: 728 case AMDGPU::DS_WRITE_B32: 729 case AMDGPU::DS_WRITE_B64: 730 case AMDGPU::DS_WRITE_B32_gfx9: 731 case AMDGPU::DS_WRITE_B64_gfx9: 732 Result.Addr = true; 733 return Result; 734 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 735 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 736 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 737 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 738 case 
AMDGPU::GLOBAL_STORE_DWORD_SADDR: 739 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 740 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 741 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 742 Result.SAddr = true; 743 [[fallthrough]]; 744 case AMDGPU::GLOBAL_LOAD_DWORD: 745 case AMDGPU::GLOBAL_LOAD_DWORDX2: 746 case AMDGPU::GLOBAL_LOAD_DWORDX3: 747 case AMDGPU::GLOBAL_LOAD_DWORDX4: 748 case AMDGPU::GLOBAL_STORE_DWORD: 749 case AMDGPU::GLOBAL_STORE_DWORDX2: 750 case AMDGPU::GLOBAL_STORE_DWORDX3: 751 case AMDGPU::GLOBAL_STORE_DWORDX4: 752 case AMDGPU::FLAT_LOAD_DWORD: 753 case AMDGPU::FLAT_LOAD_DWORDX2: 754 case AMDGPU::FLAT_LOAD_DWORDX3: 755 case AMDGPU::FLAT_LOAD_DWORDX4: 756 case AMDGPU::FLAT_STORE_DWORD: 757 case AMDGPU::FLAT_STORE_DWORDX2: 758 case AMDGPU::FLAT_STORE_DWORDX3: 759 case AMDGPU::FLAT_STORE_DWORDX4: 760 Result.VAddr = true; 761 return Result; 762 } 763 } 764 765 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 766 const SILoadStoreOptimizer &LSO) { 767 I = MI; 768 unsigned Opc = MI->getOpcode(); 769 InstClass = getInstClass(Opc, *LSO.TII); 770 771 if (InstClass == UNKNOWN) 772 return; 773 774 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 775 776 switch (InstClass) { 777 case DS_READ: 778 EltSize = 779 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 780 : 4; 781 break; 782 case DS_WRITE: 783 EltSize = 784 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 785 : 4; 786 break; 787 case S_BUFFER_LOAD_IMM: 788 case S_BUFFER_LOAD_SGPR_IMM: 789 case S_LOAD_IMM: 790 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 791 break; 792 default: 793 EltSize = 4; 794 break; 795 } 796 797 if (InstClass == MIMG) { 798 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 799 // Offset is not considered for MIMG instructions. 800 Offset = 0; 801 } else { 802 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 803 Offset = I->getOperand(OffsetIdx).getImm(); 804 } 805 806 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 807 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 808 809 Width = getOpcodeWidth(*I, *LSO.TII); 810 811 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 812 Offset &= 0xffff; 813 } else if (InstClass != MIMG) { 814 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 815 } 816 817 AddressRegs Regs = getRegs(Opc, *LSO.TII); 818 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I); 819 820 NumAddresses = 0; 821 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 822 AddrIdx[NumAddresses++] = 823 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 824 if (Regs.Addr) 825 AddrIdx[NumAddresses++] = 826 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 827 if (Regs.SBase) 828 AddrIdx[NumAddresses++] = 829 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 830 if (Regs.SRsrc) 831 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 832 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); 833 if (Regs.SOffset) 834 AddrIdx[NumAddresses++] = 835 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 836 if (Regs.SAddr) 837 AddrIdx[NumAddresses++] = 838 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 839 if (Regs.VAddr) 840 AddrIdx[NumAddresses++] = 841 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 842 if (Regs.SSamp) 843 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 844 Opc, isVIMAGEorVSAMPLE ? 
AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); 845 assert(NumAddresses <= MaxAddressRegs); 846 847 for (unsigned J = 0; J < NumAddresses; J++) 848 AddrReg[J] = &I->getOperand(AddrIdx[J]); 849 } 850 851 } // end anonymous namespace. 852 853 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 854 "SI Load Store Optimizer", false, false) 855 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 856 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 857 false, false) 858 859 char SILoadStoreOptimizer::ID = 0; 860 861 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 862 863 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 864 return new SILoadStoreOptimizer(); 865 } 866 867 static void addDefsUsesToList(const MachineInstr &MI, 868 DenseSet<Register> &RegDefs, 869 DenseSet<Register> &RegUses) { 870 for (const auto &Op : MI.operands()) { 871 if (!Op.isReg()) 872 continue; 873 if (Op.isDef()) 874 RegDefs.insert(Op.getReg()); 875 if (Op.readsReg()) 876 RegUses.insert(Op.getReg()); 877 } 878 } 879 880 bool SILoadStoreOptimizer::canSwapInstructions( 881 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 882 const MachineInstr &A, const MachineInstr &B) const { 883 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 884 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 885 return false; 886 for (const auto &BOp : B.operands()) { 887 if (!BOp.isReg()) 888 continue; 889 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 890 return false; 891 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 892 return false; 893 } 894 return true; 895 } 896 897 // Given that \p CI and \p Paired are adjacent memory operations produce a new 898 // MMO for the combined operation with a new access size. 899 MachineMemOperand * 900 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, 901 const CombineInfo &Paired) { 902 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 903 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 904 905 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue(); 906 907 // A base pointer for the combined operation is the same as the leading 908 // operation's pointer. 909 if (Paired < CI) 910 std::swap(MMOa, MMOb); 911 912 MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); 913 // If merging FLAT and GLOBAL set address space to FLAT. 914 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 915 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; 916 917 MachineFunction *MF = CI.I->getMF(); 918 return MF->getMachineMemOperand(MMOa, PtrInfo, Size); 919 } 920 921 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 922 const SIInstrInfo &TII, 923 const CombineInfo &Paired) { 924 assert(CI.InstClass == MIMG); 925 926 // Ignore instructions with tfe/lwe set. 927 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 928 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 929 930 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 931 return false; 932 933 // Check other optional immediate operands for equality. 
934 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 935 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 936 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 937 938 for (auto op : OperandsToMatch) { 939 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 940 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 941 return false; 942 if (Idx != -1 && 943 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 944 return false; 945 } 946 947 // Check DMask for overlaps. 948 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 949 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 950 951 if (!MaxMask) 952 return false; 953 954 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask); 955 if ((1u << AllowedBitsForMin) <= MinMask) 956 return false; 957 958 return true; 959 } 960 961 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 962 unsigned ComponentCount, 963 const GCNSubtarget &STI) { 964 if (ComponentCount > 4) 965 return 0; 966 967 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 968 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 969 if (!OldFormatInfo) 970 return 0; 971 972 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 973 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 974 ComponentCount, 975 OldFormatInfo->NumFormat, STI); 976 977 if (!NewFormatInfo) 978 return 0; 979 980 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 981 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 982 983 return NewFormatInfo->Format; 984 } 985 986 // Return the value in the inclusive range [Lo,Hi] that is aligned to the 987 // highest power of two. Note that the result is well defined for all inputs 988 // including corner cases like: 989 // - if Lo == Hi, return that value 990 // - if Lo == 0, return 0 (even though the "- 1" below underflows 991 // - if Lo > Hi, return 0 (as if the range wrapped around) 992 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { 993 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1); 994 } 995 996 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 997 const GCNSubtarget &STI, 998 CombineInfo &Paired, 999 bool Modify) { 1000 assert(CI.InstClass != MIMG); 1001 1002 // XXX - Would the same offset be OK? Is there any reason this would happen or 1003 // be useful? 1004 if (CI.Offset == Paired.Offset) 1005 return false; 1006 1007 // This won't be valid if the offset isn't aligned. 1008 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 1009 return false; 1010 1011 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 1012 1013 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 1014 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 1015 if (!Info0) 1016 return false; 1017 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 1018 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 1019 if (!Info1) 1020 return false; 1021 1022 if (Info0->BitsPerComp != Info1->BitsPerComp || 1023 Info0->NumFormat != Info1->NumFormat) 1024 return false; 1025 1026 // TODO: Should be possible to support more formats, but if format loads 1027 // are not dword-aligned, the merged load might not be valid. 
1028 if (Info0->BitsPerComp != 32) 1029 return false; 1030 1031 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 1032 return false; 1033 } 1034 1035 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 1036 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 1037 CI.UseST64 = false; 1038 CI.BaseOff = 0; 1039 1040 // Handle all non-DS instructions. 1041 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 1042 if (EltOffset0 + CI.Width != EltOffset1 && 1043 EltOffset1 + Paired.Width != EltOffset0) 1044 return false; 1045 if (CI.CPol != Paired.CPol) 1046 return false; 1047 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || 1048 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { 1049 // Reject cases like: 1050 // dword + dwordx2 -> dwordx3 1051 // dword + dwordx3 -> dwordx4 1052 // If we tried to combine these cases, we would fail to extract a subreg 1053 // for the result of the second load due to SGPR alignment requirements. 1054 if (CI.Width != Paired.Width && 1055 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) 1056 return false; 1057 } 1058 return true; 1059 } 1060 1061 // If the offset in elements doesn't fit in 8-bits, we might be able to use 1062 // the stride 64 versions. 1063 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 1064 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 1065 if (Modify) { 1066 CI.Offset = EltOffset0 / 64; 1067 Paired.Offset = EltOffset1 / 64; 1068 CI.UseST64 = true; 1069 } 1070 return true; 1071 } 1072 1073 // Check if the new offsets fit in the reduced 8-bit range. 1074 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 1075 if (Modify) { 1076 CI.Offset = EltOffset0; 1077 Paired.Offset = EltOffset1; 1078 } 1079 return true; 1080 } 1081 1082 // Try to shift base address to decrease offsets. 1083 uint32_t Min = std::min(EltOffset0, EltOffset1); 1084 uint32_t Max = std::max(EltOffset0, EltOffset1); 1085 1086 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 1087 if (((Max - Min) & ~Mask) == 0) { 1088 if (Modify) { 1089 // From the range of values we could use for BaseOff, choose the one that 1090 // is aligned to the highest power of two, to maximise the chance that 1091 // the same offset can be reused for other load/store pairs. 1092 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 1093 // Copy the low bits of the offsets, so that when we adjust them by 1094 // subtracting BaseOff they will be multiples of 64. 1095 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 1096 CI.BaseOff = BaseOff * CI.EltSize; 1097 CI.Offset = (EltOffset0 - BaseOff) / 64; 1098 Paired.Offset = (EltOffset1 - BaseOff) / 64; 1099 CI.UseST64 = true; 1100 } 1101 return true; 1102 } 1103 1104 if (isUInt<8>(Max - Min)) { 1105 if (Modify) { 1106 // From the range of values we could use for BaseOff, choose the one that 1107 // is aligned to the highest power of two, to maximise the chance that 1108 // the same offset can be reused for other load/store pairs. 
1109 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 1110 CI.BaseOff = BaseOff * CI.EltSize; 1111 CI.Offset = EltOffset0 - BaseOff; 1112 Paired.Offset = EltOffset1 - BaseOff; 1113 } 1114 return true; 1115 } 1116 1117 return false; 1118 } 1119 1120 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 1121 const CombineInfo &CI, 1122 const CombineInfo &Paired) { 1123 const unsigned Width = (CI.Width + Paired.Width); 1124 switch (CI.InstClass) { 1125 default: 1126 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 1127 case S_BUFFER_LOAD_IMM: 1128 case S_BUFFER_LOAD_SGPR_IMM: 1129 case S_LOAD_IMM: 1130 switch (Width) { 1131 default: 1132 return false; 1133 case 2: 1134 case 4: 1135 case 8: 1136 return true; 1137 case 3: 1138 return STM.hasScalarDwordx3Loads(); 1139 } 1140 } 1141 } 1142 1143 const TargetRegisterClass * 1144 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 1145 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 1146 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1147 } 1148 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 1149 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1150 } 1151 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 1152 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1153 } 1154 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 1155 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1156 } 1157 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 1158 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1159 } 1160 return nullptr; 1161 } 1162 1163 /// This function assumes that CI comes before Paired in a basic block. Return 1164 /// an insertion point for the merged instruction or nullptr on failure. 1165 SILoadStoreOptimizer::CombineInfo * 1166 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1167 CombineInfo &Paired) { 1168 // If another instruction has already been merged into CI, it may now be a 1169 // type that we can't do any further merging into. 1170 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1171 return nullptr; 1172 assert(CI.InstClass == Paired.InstClass); 1173 1174 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1175 getInstSubclass(Paired.I->getOpcode(), *TII)) 1176 return nullptr; 1177 1178 // Check both offsets (or masks for MIMG) can be combined and fit in the 1179 // reduced range. 1180 if (CI.InstClass == MIMG) { 1181 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1182 return nullptr; 1183 } else { 1184 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1185 return nullptr; 1186 } 1187 1188 DenseSet<Register> RegDefs; 1189 DenseSet<Register> RegUses; 1190 CombineInfo *Where; 1191 if (CI.I->mayLoad()) { 1192 // Try to hoist Paired up to CI. 1193 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1194 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1195 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1196 return nullptr; 1197 } 1198 Where = &CI; 1199 } else { 1200 // Try to sink CI down to Paired. 1201 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1202 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1203 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1204 return nullptr; 1205 } 1206 Where = &Paired; 1207 } 1208 1209 // Call offsetsCanBeCombined with modify = true so that the offsets are 1210 // correct for the new instruction. 
This should return true, because 1211 // this function should only be called on CombineInfo objects that 1212 // have already been confirmed to be mergeable. 1213 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1214 offsetsCanBeCombined(CI, *STM, Paired, true); 1215 return Where; 1216 } 1217 1218 // Copy the merged load result from DestReg to the original dest regs of CI and 1219 // Paired. 1220 void SILoadStoreOptimizer::copyToDestRegs( 1221 CombineInfo &CI, CombineInfo &Paired, 1222 MachineBasicBlock::iterator InsertBefore, int OpName, 1223 Register DestReg) const { 1224 MachineBasicBlock *MBB = CI.I->getParent(); 1225 DebugLoc DL = CI.I->getDebugLoc(); 1226 1227 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1228 1229 // Copy to the old destination registers. 1230 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1231 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName); 1232 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName); 1233 1234 // The constrained sload instructions in S_LOAD_IMM class will have 1235 // `early-clobber` flag in the dst operand. Remove the flag before using the 1236 // MOs in copies. 1237 Dest0->setIsEarlyClobber(false); 1238 Dest1->setIsEarlyClobber(false); 1239 1240 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1241 .add(*Dest0) // Copy to same destination including flags and sub reg. 1242 .addReg(DestReg, 0, SubRegIdx0); 1243 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1244 .add(*Dest1) 1245 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1246 } 1247 1248 // Return a register for the source of the merged store after copying the 1249 // original source regs of CI and Paired into it. 1250 Register 1251 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired, 1252 MachineBasicBlock::iterator InsertBefore, 1253 int OpName) const { 1254 MachineBasicBlock *MBB = CI.I->getParent(); 1255 DebugLoc DL = CI.I->getDebugLoc(); 1256 1257 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired); 1258 1259 // Copy to the new source register. 1260 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1261 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1262 1263 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName); 1264 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName); 1265 1266 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1267 .add(*Src0) 1268 .addImm(SubRegIdx0) 1269 .add(*Src1) 1270 .addImm(SubRegIdx1); 1271 1272 return SrcReg; 1273 } 1274 1275 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1276 if (STM->ldsRequiresM0Init()) 1277 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1278 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1279 } 1280 1281 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1282 if (STM->ldsRequiresM0Init()) 1283 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1284 1285 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1286 : AMDGPU::DS_READ2ST64_B64_gfx9; 1287 } 1288 1289 MachineBasicBlock::iterator 1290 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1291 MachineBasicBlock::iterator InsertBefore) { 1292 MachineBasicBlock *MBB = CI.I->getParent(); 1293 1294 // Be careful, since the addresses could be subregisters themselves in weird 1295 // cases, like vectors of pointers. 
1296 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1297 1298 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset); 1299 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset); 1300 unsigned Opc = 1301 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1302 1303 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1304 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1305 1306 const MCInstrDesc &Read2Desc = TII->get(Opc); 1307 1308 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1309 Register DestReg = MRI->createVirtualRegister(SuperRC); 1310 1311 DebugLoc DL = CI.I->getDebugLoc(); 1312 1313 Register BaseReg = AddrReg->getReg(); 1314 unsigned BaseSubReg = AddrReg->getSubReg(); 1315 unsigned BaseRegFlags = 0; 1316 if (CI.BaseOff) { 1317 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1318 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1319 .addImm(CI.BaseOff); 1320 1321 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1322 BaseRegFlags = RegState::Kill; 1323 1324 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1325 .addReg(ImmReg) 1326 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1327 .addImm(0); // clamp bit 1328 BaseSubReg = 0; 1329 } 1330 1331 MachineInstrBuilder Read2 = 1332 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1333 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1334 .addImm(NewOffset0) // offset0 1335 .addImm(NewOffset1) // offset1 1336 .addImm(0) // gds 1337 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1338 1339 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); 1340 1341 CI.I->eraseFromParent(); 1342 Paired.I->eraseFromParent(); 1343 1344 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1345 return Read2; 1346 } 1347 1348 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1349 if (STM->ldsRequiresM0Init()) 1350 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1351 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1352 : AMDGPU::DS_WRITE2_B64_gfx9; 1353 } 1354 1355 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1356 if (STM->ldsRequiresM0Init()) 1357 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1358 : AMDGPU::DS_WRITE2ST64_B64; 1359 1360 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1361 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1362 } 1363 1364 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1365 CombineInfo &CI, CombineInfo &Paired, 1366 MachineBasicBlock::iterator InsertBefore) { 1367 MachineBasicBlock *MBB = CI.I->getParent(); 1368 1369 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1370 // sure we preserve the subregister index and any register flags set on them. 1371 const MachineOperand *AddrReg = 1372 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1373 const MachineOperand *Data0 = 1374 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1375 const MachineOperand *Data1 = 1376 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1377 1378 unsigned NewOffset0 = CI.Offset; 1379 unsigned NewOffset1 = Paired.Offset; 1380 unsigned Opc = 1381 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1382 1383 if (NewOffset0 > NewOffset1) { 1384 // Canonicalize the merged instruction so the smaller offset comes first. 
1385 std::swap(NewOffset0, NewOffset1); 1386 std::swap(Data0, Data1); 1387 } 1388 1389 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1390 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1391 1392 const MCInstrDesc &Write2Desc = TII->get(Opc); 1393 DebugLoc DL = CI.I->getDebugLoc(); 1394 1395 Register BaseReg = AddrReg->getReg(); 1396 unsigned BaseSubReg = AddrReg->getSubReg(); 1397 unsigned BaseRegFlags = 0; 1398 if (CI.BaseOff) { 1399 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1400 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1401 .addImm(CI.BaseOff); 1402 1403 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1404 BaseRegFlags = RegState::Kill; 1405 1406 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1407 .addReg(ImmReg) 1408 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1409 .addImm(0); // clamp bit 1410 BaseSubReg = 0; 1411 } 1412 1413 MachineInstrBuilder Write2 = 1414 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1415 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1416 .add(*Data0) // data0 1417 .add(*Data1) // data1 1418 .addImm(NewOffset0) // offset0 1419 .addImm(NewOffset1) // offset1 1420 .addImm(0) // gds 1421 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1422 1423 CI.I->eraseFromParent(); 1424 Paired.I->eraseFromParent(); 1425 1426 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1427 return Write2; 1428 } 1429 1430 MachineBasicBlock::iterator 1431 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1432 MachineBasicBlock::iterator InsertBefore) { 1433 MachineBasicBlock *MBB = CI.I->getParent(); 1434 DebugLoc DL = CI.I->getDebugLoc(); 1435 const unsigned Opcode = getNewOpcode(CI, Paired); 1436 1437 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1438 1439 Register DestReg = MRI->createVirtualRegister(SuperRC); 1440 unsigned MergedDMask = CI.DMask | Paired.DMask; 1441 unsigned DMaskIdx = 1442 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1443 1444 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1445 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1446 if (I == DMaskIdx) 1447 MIB.addImm(MergedDMask); 1448 else 1449 MIB.add((*CI.I).getOperand(I)); 1450 } 1451 1452 // It shouldn't be possible to get this far if the two instructions 1453 // don't have a single memoperand, because MachineInstr::mayAlias() 1454 // will return true if this is the case. 
1455 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1456 1457 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1458 1459 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); 1460 1461 CI.I->eraseFromParent(); 1462 Paired.I->eraseFromParent(); 1463 return New; 1464 } 1465 1466 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1467 CombineInfo &CI, CombineInfo &Paired, 1468 MachineBasicBlock::iterator InsertBefore) { 1469 MachineBasicBlock *MBB = CI.I->getParent(); 1470 DebugLoc DL = CI.I->getDebugLoc(); 1471 const unsigned Opcode = getNewOpcode(CI, Paired); 1472 1473 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1474 1475 Register DestReg = MRI->createVirtualRegister(SuperRC); 1476 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1477 1478 // It shouldn't be possible to get this far if the two instructions 1479 // don't have a single memoperand, because MachineInstr::mayAlias() 1480 // will return true if this is the case. 1481 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1482 1483 MachineInstrBuilder New = 1484 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1485 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1486 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1487 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1488 New.addImm(MergedOffset); 1489 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1490 1491 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg); 1492 1493 CI.I->eraseFromParent(); 1494 Paired.I->eraseFromParent(); 1495 return New; 1496 } 1497 1498 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1499 CombineInfo &CI, CombineInfo &Paired, 1500 MachineBasicBlock::iterator InsertBefore) { 1501 MachineBasicBlock *MBB = CI.I->getParent(); 1502 DebugLoc DL = CI.I->getDebugLoc(); 1503 1504 const unsigned Opcode = getNewOpcode(CI, Paired); 1505 1506 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1507 1508 // Copy to the new source register. 1509 Register DestReg = MRI->createVirtualRegister(SuperRC); 1510 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1511 1512 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1513 1514 AddressRegs Regs = getRegs(Opcode, *TII); 1515 1516 if (Regs.VAddr) 1517 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1518 1519 // It shouldn't be possible to get this far if the two instructions 1520 // don't have a single memoperand, because MachineInstr::mayAlias() 1521 // will return true if this is the case. 
1522 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1523 1524 MachineInstr *New = 1525 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1526 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1527 .addImm(MergedOffset) // offset 1528 .addImm(CI.CPol) // cpol 1529 .addImm(0) // swz 1530 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1531 1532 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); 1533 1534 CI.I->eraseFromParent(); 1535 Paired.I->eraseFromParent(); 1536 return New; 1537 } 1538 1539 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1540 CombineInfo &CI, CombineInfo &Paired, 1541 MachineBasicBlock::iterator InsertBefore) { 1542 MachineBasicBlock *MBB = CI.I->getParent(); 1543 DebugLoc DL = CI.I->getDebugLoc(); 1544 1545 const unsigned Opcode = getNewOpcode(CI, Paired); 1546 1547 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1548 1549 // Copy to the new source register. 1550 Register DestReg = MRI->createVirtualRegister(SuperRC); 1551 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1552 1553 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1554 1555 AddressRegs Regs = getRegs(Opcode, *TII); 1556 1557 if (Regs.VAddr) 1558 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1559 1560 unsigned JoinedFormat = 1561 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1562 1563 // It shouldn't be possible to get this far if the two instructions 1564 // don't have a single memoperand, because MachineInstr::mayAlias() 1565 // will return true if this is the case. 1566 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1567 1568 MachineInstr *New = 1569 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1570 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1571 .addImm(MergedOffset) // offset 1572 .addImm(JoinedFormat) // format 1573 .addImm(CI.CPol) // cpol 1574 .addImm(0) // swz 1575 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1576 1577 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg); 1578 1579 CI.I->eraseFromParent(); 1580 Paired.I->eraseFromParent(); 1581 return New; 1582 } 1583 1584 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1585 CombineInfo &CI, CombineInfo &Paired, 1586 MachineBasicBlock::iterator InsertBefore) { 1587 MachineBasicBlock *MBB = CI.I->getParent(); 1588 DebugLoc DL = CI.I->getDebugLoc(); 1589 1590 const unsigned Opcode = getNewOpcode(CI, Paired); 1591 1592 Register SrcReg = 1593 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1594 1595 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1596 .addReg(SrcReg, RegState::Kill); 1597 1598 AddressRegs Regs = getRegs(Opcode, *TII); 1599 1600 if (Regs.VAddr) 1601 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1602 1603 unsigned JoinedFormat = 1604 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1605 1606 // It shouldn't be possible to get this far if the two instructions 1607 // don't have a single memoperand, because MachineInstr::mayAlias() 1608 // will return true if this is the case. 
1609 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1610 1611 MachineInstr *New = 1612 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1613 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1614 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1615 .addImm(JoinedFormat) // format 1616 .addImm(CI.CPol) // cpol 1617 .addImm(0) // swz 1618 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1619 1620 CI.I->eraseFromParent(); 1621 Paired.I->eraseFromParent(); 1622 return New; 1623 } 1624 1625 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1626 CombineInfo &CI, CombineInfo &Paired, 1627 MachineBasicBlock::iterator InsertBefore) { 1628 MachineBasicBlock *MBB = CI.I->getParent(); 1629 DebugLoc DL = CI.I->getDebugLoc(); 1630 1631 const unsigned Opcode = getNewOpcode(CI, Paired); 1632 1633 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1634 Register DestReg = MRI->createVirtualRegister(SuperRC); 1635 1636 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1637 1638 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1639 MIB.add(*SAddr); 1640 1641 MachineInstr *New = 1642 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1643 .addImm(std::min(CI.Offset, Paired.Offset)) 1644 .addImm(CI.CPol) 1645 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1646 1647 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg); 1648 1649 CI.I->eraseFromParent(); 1650 Paired.I->eraseFromParent(); 1651 return New; 1652 } 1653 1654 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1655 CombineInfo &CI, CombineInfo &Paired, 1656 MachineBasicBlock::iterator InsertBefore) { 1657 MachineBasicBlock *MBB = CI.I->getParent(); 1658 DebugLoc DL = CI.I->getDebugLoc(); 1659 1660 const unsigned Opcode = getNewOpcode(CI, Paired); 1661 1662 Register SrcReg = 1663 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1664 1665 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1666 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1667 .addReg(SrcReg, RegState::Kill); 1668 1669 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1670 MIB.add(*SAddr); 1671 1672 MachineInstr *New = 1673 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1674 .addImm(CI.CPol) 1675 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1676 1677 CI.I->eraseFromParent(); 1678 Paired.I->eraseFromParent(); 1679 return New; 1680 } 1681 1682 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1683 const CombineInfo &Paired) { 1684 const unsigned Width = CI.Width + Paired.Width; 1685 1686 switch (getCommonInstClass(CI, Paired)) { 1687 default: 1688 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1689 // FIXME: Handle d16 correctly 1690 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1691 Width); 1692 case TBUFFER_LOAD: 1693 case TBUFFER_STORE: 1694 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1695 Width); 1696 1697 case UNKNOWN: 1698 llvm_unreachable("Unknown instruction class"); 1699 case S_BUFFER_LOAD_IMM: 1700 switch (Width) { 1701 default: 1702 return 0; 1703 case 2: 1704 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1705 case 3: 1706 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM; 1707 case 4: 1708 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1709 case 8: 1710 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1711 } 1712 case S_BUFFER_LOAD_SGPR_IMM: 1713 
switch (Width) { 1714 default: 1715 return 0; 1716 case 2: 1717 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1718 case 3: 1719 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM; 1720 case 4: 1721 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1722 case 8: 1723 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1724 } 1725 case S_LOAD_IMM: { 1726 // If XNACK is enabled, use the constrained opcodes when the first load is 1727 // under-aligned. 1728 const MachineMemOperand *MMO = *CI.I->memoperands_begin(); 1729 bool NeedsConstrainedOpc = 1730 STM->isXNACKEnabled() && MMO->getAlign().value() < Width * 4; 1731 switch (Width) { 1732 default: 1733 return 0; 1734 case 2: 1735 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec 1736 : AMDGPU::S_LOAD_DWORDX2_IMM; 1737 case 3: 1738 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec 1739 : AMDGPU::S_LOAD_DWORDX3_IMM; 1740 case 4: 1741 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec 1742 : AMDGPU::S_LOAD_DWORDX4_IMM; 1743 case 8: 1744 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec 1745 : AMDGPU::S_LOAD_DWORDX8_IMM; 1746 } 1747 } 1748 case GLOBAL_LOAD: 1749 switch (Width) { 1750 default: 1751 return 0; 1752 case 2: 1753 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1754 case 3: 1755 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1756 case 4: 1757 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1758 } 1759 case GLOBAL_LOAD_SADDR: 1760 switch (Width) { 1761 default: 1762 return 0; 1763 case 2: 1764 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1765 case 3: 1766 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1767 case 4: 1768 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1769 } 1770 case GLOBAL_STORE: 1771 switch (Width) { 1772 default: 1773 return 0; 1774 case 2: 1775 return AMDGPU::GLOBAL_STORE_DWORDX2; 1776 case 3: 1777 return AMDGPU::GLOBAL_STORE_DWORDX3; 1778 case 4: 1779 return AMDGPU::GLOBAL_STORE_DWORDX4; 1780 } 1781 case GLOBAL_STORE_SADDR: 1782 switch (Width) { 1783 default: 1784 return 0; 1785 case 2: 1786 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1787 case 3: 1788 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1789 case 4: 1790 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1791 } 1792 case FLAT_LOAD: 1793 switch (Width) { 1794 default: 1795 return 0; 1796 case 2: 1797 return AMDGPU::FLAT_LOAD_DWORDX2; 1798 case 3: 1799 return AMDGPU::FLAT_LOAD_DWORDX3; 1800 case 4: 1801 return AMDGPU::FLAT_LOAD_DWORDX4; 1802 } 1803 case FLAT_STORE: 1804 switch (Width) { 1805 default: 1806 return 0; 1807 case 2: 1808 return AMDGPU::FLAT_STORE_DWORDX2; 1809 case 3: 1810 return AMDGPU::FLAT_STORE_DWORDX3; 1811 case 4: 1812 return AMDGPU::FLAT_STORE_DWORDX4; 1813 } 1814 case MIMG: 1815 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1816 "No overlaps"); 1817 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1818 } 1819 } 1820 1821 std::pair<unsigned, unsigned> 1822 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1823 const CombineInfo &Paired) { 1824 assert((CI.InstClass != MIMG || 1825 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1826 CI.Width + Paired.Width)) && 1827 "No overlaps"); 1828 1829 unsigned Idx0; 1830 unsigned Idx1; 1831 1832 static const unsigned Idxs[5][4] = { 1833 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1834 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1835 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1836 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, 
AMDGPU::sub3_sub4_sub5_sub6}, 1837 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1838 }; 1839 1840 assert(CI.Width >= 1 && CI.Width <= 4); 1841 assert(Paired.Width >= 1 && Paired.Width <= 4); 1842 1843 if (Paired < CI) { 1844 Idx1 = Idxs[0][Paired.Width - 1]; 1845 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1846 } else { 1847 Idx0 = Idxs[0][CI.Width - 1]; 1848 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1849 } 1850 1851 return {Idx0, Idx1}; 1852 } 1853 1854 const TargetRegisterClass * 1855 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1856 const CombineInfo &Paired) const { 1857 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1858 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1859 switch (CI.Width + Paired.Width) { 1860 default: 1861 return nullptr; 1862 case 2: 1863 return &AMDGPU::SReg_64_XEXECRegClass; 1864 case 3: 1865 return &AMDGPU::SGPR_96RegClass; 1866 case 4: 1867 return &AMDGPU::SGPR_128RegClass; 1868 case 8: 1869 return &AMDGPU::SGPR_256RegClass; 1870 case 16: 1871 return &AMDGPU::SGPR_512RegClass; 1872 } 1873 } 1874 1875 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1876 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1877 ? TRI->getAGPRClassForBitWidth(BitWidth) 1878 : TRI->getVGPRClassForBitWidth(BitWidth); 1879 } 1880 1881 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1882 CombineInfo &CI, CombineInfo &Paired, 1883 MachineBasicBlock::iterator InsertBefore) { 1884 MachineBasicBlock *MBB = CI.I->getParent(); 1885 DebugLoc DL = CI.I->getDebugLoc(); 1886 1887 const unsigned Opcode = getNewOpcode(CI, Paired); 1888 1889 Register SrcReg = 1890 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata); 1891 1892 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1893 .addReg(SrcReg, RegState::Kill); 1894 1895 AddressRegs Regs = getRegs(Opcode, *TII); 1896 1897 if (Regs.VAddr) 1898 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1899 1900 1901 // It shouldn't be possible to get this far if the two instructions 1902 // don't have a single memoperand, because MachineInstr::mayAlias() 1903 // will return true if this is the case. 1904 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1905 1906 MachineInstr *New = 1907 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1908 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1909 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1910 .addImm(CI.CPol) // cpol 1911 .addImm(0) // swz 1912 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1913 1914 CI.I->eraseFromParent(); 1915 Paired.I->eraseFromParent(); 1916 return New; 1917 } 1918 1919 MachineOperand 1920 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1921 APInt V(32, Val, true); 1922 if (TII->isInlineConstant(V)) 1923 return MachineOperand::CreateImm(Val); 1924 1925 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1926 MachineInstr *Mov = 1927 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1928 TII->get(AMDGPU::S_MOV_B32), Reg) 1929 .addImm(Val); 1930 (void)Mov; 1931 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1932 return MachineOperand::CreateReg(Reg, false); 1933 } 1934 1935 // Compute base address using Addr and return the final register. 
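// The rebuilt base is materialized as a 32-bit add-with-carry pair followed by
// a REG_SEQUENCE, roughly (a sketch only; the virtual register names below are
// illustrative, not actual output):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo
//   %hi:vgpr_32 = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1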
1936 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1937 const MemAddress &Addr) const { 1938 MachineBasicBlock *MBB = MI.getParent(); 1939 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1940 DebugLoc DL = MI.getDebugLoc(); 1941 1942 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1943 Addr.Base.LoSubReg) && 1944 "Expected 32-bit Base-Register-Low!!"); 1945 1946 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1947 Addr.Base.HiSubReg) && 1948 "Expected 32-bit Base-Register-Hi!!"); 1949 1950 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1951 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1952 MachineOperand OffsetHi = 1953 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1954 1955 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1956 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1957 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1958 1959 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1960 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1961 MachineInstr *LoHalf = 1962 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1963 .addReg(CarryReg, RegState::Define) 1964 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1965 .add(OffsetLo) 1966 .addImm(0); // clamp bit 1967 (void)LoHalf; 1968 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1969 1970 MachineInstr *HiHalf = 1971 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1972 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1973 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1974 .add(OffsetHi) 1975 .addReg(CarryReg, RegState::Kill) 1976 .addImm(0); // clamp bit 1977 (void)HiHalf; 1978 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1979 1980 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 1981 MachineInstr *FullBase = 1982 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1983 .addReg(DestSub0) 1984 .addImm(AMDGPU::sub0) 1985 .addReg(DestSub1) 1986 .addImm(AMDGPU::sub1); 1987 (void)FullBase; 1988 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1989 1990 return FullDestReg; 1991 } 1992 1993 // Update base and offset with the NewBase and NewOffset in MI. 
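// Only the vaddr operand and the offset immediate of MI are rewritten in
// place; the kill flag on the base register is cleared so the same NewBase can
// be shared by several instructions rewritten against it.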
1994 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, 1995 Register NewBase, 1996 int32_t NewOffset) const { 1997 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 1998 Base->setReg(NewBase); 1999 Base->setIsKill(false); 2000 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); 2001 } 2002 2003 std::optional<int32_t> 2004 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { 2005 if (Op.isImm()) 2006 return Op.getImm(); 2007 2008 if (!Op.isReg()) 2009 return std::nullopt; 2010 2011 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 2012 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || 2013 !Def->getOperand(1).isImm()) 2014 return std::nullopt; 2015 2016 return Def->getOperand(1).getImm(); 2017 } 2018 2019 // Analyze Base and extracts: 2020 // - 32bit base registers, subregisters 2021 // - 64bit constant offset 2022 // Expecting base computation as: 2023 // %OFFSET0:sgpr_32 = S_MOV_B32 8000 2024 // %LO:vgpr_32, %c:sreg_64_xexec = 2025 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, 2026 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec 2027 // %Base:vreg_64 = 2028 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 2029 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, 2030 MemAddress &Addr) const { 2031 if (!Base.isReg()) 2032 return; 2033 2034 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); 2035 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE 2036 || Def->getNumOperands() != 5) 2037 return; 2038 2039 MachineOperand BaseLo = Def->getOperand(1); 2040 MachineOperand BaseHi = Def->getOperand(3); 2041 if (!BaseLo.isReg() || !BaseHi.isReg()) 2042 return; 2043 2044 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); 2045 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); 2046 2047 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || 2048 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) 2049 return; 2050 2051 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); 2052 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); 2053 2054 auto Offset0P = extractConstOffset(*Src0); 2055 if (Offset0P) 2056 BaseLo = *Src1; 2057 else { 2058 if (!(Offset0P = extractConstOffset(*Src1))) 2059 return; 2060 BaseLo = *Src0; 2061 } 2062 2063 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); 2064 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); 2065 2066 if (Src0->isImm()) 2067 std::swap(Src0, Src1); 2068 2069 if (!Src1->isImm() || Src0->isImm()) 2070 return; 2071 2072 uint64_t Offset1 = Src1->getImm(); 2073 BaseHi = *Src0; 2074 2075 Addr.Base.LoReg = BaseLo.getReg(); 2076 Addr.Base.HiReg = BaseHi.getReg(); 2077 Addr.Base.LoSubReg = BaseLo.getSubReg(); 2078 Addr.Base.HiSubReg = BaseHi.getSubReg(); 2079 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); 2080 } 2081 2082 bool SILoadStoreOptimizer::promoteConstantOffsetToImm( 2083 MachineInstr &MI, 2084 MemInfoMap &Visited, 2085 SmallPtrSet<MachineInstr *, 4> &AnchorList) const { 2086 2087 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI)) 2088 return false; 2089 2090 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers. 2091 if (SIInstrInfo::isFLATScratch(MI)) 2092 return false; 2093 2094 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? 
      AMDGPUAS::GLOBAL_ADDRESS : AMDGPUAS::FLAT_ADDRESS;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) at the highest 13-bit distance
  // from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
  // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because the maximum distance can
  // presumably accommodate more intermediate addresses.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
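    // For now only instructions with the same opcode as MI and a still-zero
    // offset immediate are considered as anchor candidates.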
2166 if (MINext.getOpcode() != MI.getOpcode() || 2167 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2168 continue; 2169 2170 const MachineOperand &BaseNext = 2171 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2172 MemAddress MAddrNext; 2173 if (!Visited.contains(&MINext)) { 2174 processBaseWithConstOffset(BaseNext, MAddrNext); 2175 Visited[&MINext] = MAddrNext; 2176 } else 2177 MAddrNext = Visited[&MINext]; 2178 2179 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2180 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2181 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2182 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2183 continue; 2184 2185 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset); 2186 2187 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2188 TargetLoweringBase::AddrMode AM; 2189 AM.HasBaseReg = true; 2190 AM.BaseOffs = Dist; 2191 if (TLI->isLegalFlatAddressingMode(AM, AS) && 2192 (uint32_t)std::abs(Dist) > MaxDist) { 2193 MaxDist = std::abs(Dist); 2194 2195 AnchorAddr = MAddrNext; 2196 AnchorInst = &MINext; 2197 } 2198 } 2199 2200 if (AnchorInst) { 2201 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2202 AnchorInst->dump()); 2203 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2204 << AnchorAddr.Offset << "\n\n"); 2205 2206 // Instead of moving up, just re-compute anchor-instruction's base address. 2207 Register Base = computeBase(MI, AnchorAddr); 2208 2209 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2210 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2211 2212 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) { 2213 TargetLoweringBase::AddrMode AM; 2214 AM.HasBaseReg = true; 2215 AM.BaseOffs = OtherOffset - AnchorAddr.Offset; 2216 2217 if (TLI->isLegalFlatAddressingMode(AM, AS)) { 2218 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")"; 2219 OtherMI->dump()); 2220 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset); 2221 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump()); 2222 } 2223 } 2224 AnchorList.insert(AnchorInst); 2225 return true; 2226 } 2227 2228 return false; 2229 } 2230 2231 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2232 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2233 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2234 if (AddrList.front().InstClass == CI.InstClass && 2235 AddrList.front().IsAGPR == CI.IsAGPR && 2236 AddrList.front().hasSameBaseAddress(CI)) { 2237 AddrList.emplace_back(CI); 2238 return; 2239 } 2240 } 2241 2242 // Base address not found, so add a new list. 2243 MergeableInsts.emplace_back(1, CI); 2244 } 2245 2246 std::pair<MachineBasicBlock::iterator, bool> 2247 SILoadStoreOptimizer::collectMergeableInsts( 2248 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2249 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2250 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2251 bool Modified = false; 2252 2253 // Sort potential mergeable instructions into lists. One list per base address. 2254 unsigned Order = 0; 2255 MachineBasicBlock::iterator BlockI = Begin; 2256 for (; BlockI != End; ++BlockI) { 2257 MachineInstr &MI = *BlockI; 2258 2259 // We run this before checking if an address is mergeable, because it can produce 2260 // better code even if the instructions aren't mergeable. 
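    // Note that promoteConstantOffsetToImm only rewrites FLAT and GLOBAL
    // accesses (it currently expects 64-bit pointers and bails out early for
    // anything else), so most instructions fall straight through to the merge
    // collection below.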
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can resume the search for merges after such a barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with the "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However, we report that ds_write2 shall have only VGPR
      // data so that machine copy propagation does not create an illegal
      // instruction with VGPR and AGPR sources. Consequently, if we create
      // such an instruction the verifier will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset, and then for each CombineInfo object in
  // a list try to find another instruction that can be merged with it. If one
  // is found, it is stored in the Paired field; if none is found, the
  // CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a
      // merge, so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return {BlockI, Modified};
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
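// Each inner list of MergeableInsts holds instructions that share a base
// address. optimizeBlock merges within one list at a time, erases lists that
// cannot yield any further merges, and sets OptimizeAgain when a merged result
// could still be widened on a subsequent pass over the block.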
2341 bool SILoadStoreOptimizer::optimizeBlock( 2342 std::list<std::list<CombineInfo> > &MergeableInsts) { 2343 bool Modified = false; 2344 2345 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2346 E = MergeableInsts.end(); I != E;) { 2347 std::list<CombineInfo> &MergeList = *I; 2348 2349 bool OptimizeListAgain = false; 2350 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2351 // We weren't able to make any changes, so delete the list so we don't 2352 // process the same instructions the next time we try to optimize this 2353 // block. 2354 I = MergeableInsts.erase(I); 2355 continue; 2356 } 2357 2358 Modified = true; 2359 2360 // We made changes, but also determined that there were no more optimization 2361 // opportunities, so we don't need to reprocess the list 2362 if (!OptimizeListAgain) { 2363 I = MergeableInsts.erase(I); 2364 continue; 2365 } 2366 OptimizeAgain = true; 2367 } 2368 return Modified; 2369 } 2370 2371 bool 2372 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2373 std::list<CombineInfo> &MergeList, 2374 bool &OptimizeListAgain) { 2375 if (MergeList.empty()) 2376 return false; 2377 2378 bool Modified = false; 2379 2380 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2381 Next = std::next(I)) { 2382 2383 auto First = I; 2384 auto Second = Next; 2385 2386 if ((*First).Order > (*Second).Order) 2387 std::swap(First, Second); 2388 CombineInfo &CI = *First; 2389 CombineInfo &Paired = *Second; 2390 2391 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2392 if (!Where) { 2393 ++I; 2394 continue; 2395 } 2396 2397 Modified = true; 2398 2399 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2400 2401 MachineBasicBlock::iterator NewMI; 2402 switch (CI.InstClass) { 2403 default: 2404 llvm_unreachable("unknown InstClass"); 2405 break; 2406 case DS_READ: 2407 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2408 break; 2409 case DS_WRITE: 2410 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2411 break; 2412 case S_BUFFER_LOAD_IMM: 2413 case S_BUFFER_LOAD_SGPR_IMM: 2414 case S_LOAD_IMM: 2415 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2416 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2417 break; 2418 case BUFFER_LOAD: 2419 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2420 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2421 break; 2422 case BUFFER_STORE: 2423 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2424 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2425 break; 2426 case MIMG: 2427 NewMI = mergeImagePair(CI, Paired, Where->I); 2428 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2429 break; 2430 case TBUFFER_LOAD: 2431 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2432 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2433 break; 2434 case TBUFFER_STORE: 2435 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2436 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2437 break; 2438 case FLAT_LOAD: 2439 case GLOBAL_LOAD: 2440 case GLOBAL_LOAD_SADDR: 2441 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2442 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2443 break; 2444 case FLAT_STORE: 2445 case GLOBAL_STORE: 2446 case GLOBAL_STORE_SADDR: 2447 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2448 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2449 break; 2450 } 2451 CI.setMI(NewMI, *this); 2452 CI.Order = Where->Order; 2453 if (I == Second) 2454 I = Next; 2455 2456 MergeList.erase(Second); 2457 } 2458 2459 return Modified; 2460 } 2461 
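// Entry point: each basic block is processed in sections delimited by
// instructions with ordered memory references or unmodeled side effects.
// Within a section, mergeable instructions are collected per base address and
// then merged repeatedly until no further progress is made.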
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the instructions for which constant offsets are being promoted
  // to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}
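// A note on testing: the merge and promotion logic above is typically
// exercised in isolation from MIR tests, with an invocation along the lines of
// (illustrative only; exact flags vary per test):
//   llc -mtriple=amdgcn -run-pass=si-load-store-opt -verify-machineinstrs -o - %s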