//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because the load of the constant
//   into the data register is placed between the stores, although this is
//   arguably a scheduling problem.
//
// - Recomputing live intervals seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but the offsets are close enough together to fit in 8 bits after
//   adjusting the base, we can add to the base pointer and use the new reduced
//   offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD,  // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE  // any CombineInfo; they are only ever returned by
                // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ?
DMask < Other.DMask : Offset < Other.Offset; 183 } 184 }; 185 186 struct BaseRegisters { 187 Register LoReg; 188 Register HiReg; 189 190 unsigned LoSubReg = 0; 191 unsigned HiSubReg = 0; 192 }; 193 194 struct MemAddress { 195 BaseRegisters Base; 196 int64_t Offset = 0; 197 }; 198 199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 200 201 private: 202 const GCNSubtarget *STM = nullptr; 203 const SIInstrInfo *TII = nullptr; 204 const SIRegisterInfo *TRI = nullptr; 205 MachineRegisterInfo *MRI = nullptr; 206 AliasAnalysis *AA = nullptr; 207 bool OptimizeAgain; 208 209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 210 const DenseSet<Register> &ARegUses, 211 const MachineInstr &A, const MachineInstr &B) const; 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 223 const CombineInfo &Paired); 224 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 225 226 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 227 228 unsigned read2Opcode(unsigned EltSize) const; 229 unsigned read2ST64Opcode(unsigned EltSize) const; 230 MachineBasicBlock::iterator 231 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 232 MachineBasicBlock::iterator InsertBefore); 233 234 unsigned write2Opcode(unsigned EltSize) const; 235 unsigned write2ST64Opcode(unsigned EltSize) const; 236 MachineBasicBlock::iterator 237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 238 MachineBasicBlock::iterator InsertBefore); 239 MachineBasicBlock::iterator 240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 241 MachineBasicBlock::iterator InsertBefore); 242 MachineBasicBlock::iterator 243 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 244 MachineBasicBlock::iterator InsertBefore); 245 MachineBasicBlock::iterator 246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 247 MachineBasicBlock::iterator InsertBefore); 248 MachineBasicBlock::iterator 249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 250 MachineBasicBlock::iterator InsertBefore); 251 MachineBasicBlock::iterator 252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 253 MachineBasicBlock::iterator InsertBefore); 254 MachineBasicBlock::iterator 255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 256 MachineBasicBlock::iterator InsertBefore); 257 MachineBasicBlock::iterator 258 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 259 MachineBasicBlock::iterator InsertBefore); 260 MachineBasicBlock::iterator 261 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 262 MachineBasicBlock::iterator InsertBefore); 263 264 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 265 int32_t NewOffset) const; 266 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 267 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 268 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 269 void processBaseWithConstOffset(const MachineOperand 
&Base, MemAddress &Addr) const; 270 /// Promotes constant offset to the immediate by adjusting the base. It 271 /// tries to use a base from the nearby instructions that allows it to have 272 /// a 13bit constant offset which gets promoted to the immediate. 273 bool promoteConstantOffsetToImm(MachineInstr &CI, 274 MemInfoMap &Visited, 275 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 276 void addInstToMergeableList(const CombineInfo &CI, 277 std::list<std::list<CombineInfo> > &MergeableInsts) const; 278 279 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 280 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 281 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 282 std::list<std::list<CombineInfo>> &MergeableInsts) const; 283 284 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 285 const CombineInfo &Paired); 286 287 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 288 const CombineInfo &Paired); 289 290 public: 291 static char ID; 292 293 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 294 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 295 } 296 297 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 298 bool &OptimizeListAgain); 299 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 300 301 bool runOnMachineFunction(MachineFunction &MF) override; 302 303 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 304 305 void getAnalysisUsage(AnalysisUsage &AU) const override { 306 AU.setPreservesCFG(); 307 AU.addRequired<AAResultsWrapperPass>(); 308 309 MachineFunctionPass::getAnalysisUsage(AU); 310 } 311 312 MachineFunctionProperties getRequiredProperties() const override { 313 return MachineFunctionProperties() 314 .set(MachineFunctionProperties::Property::IsSSA); 315 } 316 }; 317 318 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 319 const unsigned Opc = MI.getOpcode(); 320 321 if (TII.isMUBUF(Opc)) { 322 // FIXME: Handle d16 correctly 323 return AMDGPU::getMUBUFElements(Opc); 324 } 325 if (TII.isImage(MI)) { 326 uint64_t DMaskImm = 327 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 328 return llvm::popcount(DMaskImm); 329 } 330 if (TII.isMTBUF(Opc)) { 331 return AMDGPU::getMTBUFElements(Opc); 332 } 333 334 switch (Opc) { 335 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 336 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 337 case AMDGPU::S_LOAD_DWORD_IMM: 338 case AMDGPU::GLOBAL_LOAD_DWORD: 339 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 340 case AMDGPU::GLOBAL_STORE_DWORD: 341 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 342 case AMDGPU::FLAT_LOAD_DWORD: 343 case AMDGPU::FLAT_STORE_DWORD: 344 return 1; 345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 347 case AMDGPU::S_LOAD_DWORDX2_IMM: 348 case AMDGPU::GLOBAL_LOAD_DWORDX2: 349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 350 case AMDGPU::GLOBAL_STORE_DWORDX2: 351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 352 case AMDGPU::FLAT_LOAD_DWORDX2: 353 case AMDGPU::FLAT_STORE_DWORDX2: 354 return 2; 355 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 356 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 357 case AMDGPU::S_LOAD_DWORDX3_IMM: 358 case AMDGPU::GLOBAL_LOAD_DWORDX3: 359 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 360 case AMDGPU::GLOBAL_STORE_DWORDX3: 361 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 362 case AMDGPU::FLAT_LOAD_DWORDX3: 363 case AMDGPU::FLAT_STORE_DWORDX3: 364 return 3; 365 case 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 366 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 367 case AMDGPU::S_LOAD_DWORDX4_IMM: 368 case AMDGPU::GLOBAL_LOAD_DWORDX4: 369 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 370 case AMDGPU::GLOBAL_STORE_DWORDX4: 371 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 372 case AMDGPU::FLAT_LOAD_DWORDX4: 373 case AMDGPU::FLAT_STORE_DWORDX4: 374 return 4; 375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 377 case AMDGPU::S_LOAD_DWORDX8_IMM: 378 return 8; 379 case AMDGPU::DS_READ_B32: [[fallthrough]]; 380 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]]; 381 case AMDGPU::DS_WRITE_B32: [[fallthrough]]; 382 case AMDGPU::DS_WRITE_B32_gfx9: 383 return 1; 384 case AMDGPU::DS_READ_B64: [[fallthrough]]; 385 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]]; 386 case AMDGPU::DS_WRITE_B64: [[fallthrough]]; 387 case AMDGPU::DS_WRITE_B64_gfx9: 388 return 2; 389 default: 390 return 0; 391 } 392 } 393 394 /// Maps instruction opcode to enum InstClassEnum. 395 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 396 switch (Opc) { 397 default: 398 if (TII.isMUBUF(Opc)) { 399 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 400 default: 401 return UNKNOWN; 402 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 403 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 404 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 405 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 406 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN: 407 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact: 408 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET: 409 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact: 410 return BUFFER_LOAD; 411 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 412 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 413 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 414 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 415 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN: 416 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact: 417 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET: 418 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact: 419 return BUFFER_STORE; 420 } 421 } 422 if (TII.isImage(Opc)) { 423 // Ignore instructions encoded without vaddr. 424 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) && 425 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0)) 426 return UNKNOWN; 427 // Ignore BVH instructions 428 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 429 return UNKNOWN; 430 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
431 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 432 TII.isGather4(Opc)) 433 return UNKNOWN; 434 return MIMG; 435 } 436 if (TII.isMTBUF(Opc)) { 437 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 438 default: 439 return UNKNOWN; 440 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 441 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 442 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 443 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 444 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 445 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 446 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 447 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 448 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN: 449 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact: 450 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN: 451 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact: 452 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN: 453 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact: 454 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET: 455 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact: 456 return TBUFFER_LOAD; 457 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 458 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 459 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 460 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 461 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN: 462 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact: 463 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET: 464 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact: 465 return TBUFFER_STORE; 466 } 467 } 468 return UNKNOWN; 469 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 470 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 471 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 472 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 473 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 474 return S_BUFFER_LOAD_IMM; 475 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 476 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 477 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 478 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 479 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 480 return S_BUFFER_LOAD_SGPR_IMM; 481 case AMDGPU::S_LOAD_DWORD_IMM: 482 case AMDGPU::S_LOAD_DWORDX2_IMM: 483 case AMDGPU::S_LOAD_DWORDX3_IMM: 484 case AMDGPU::S_LOAD_DWORDX4_IMM: 485 case AMDGPU::S_LOAD_DWORDX8_IMM: 486 return S_LOAD_IMM; 487 case AMDGPU::DS_READ_B32: 488 case AMDGPU::DS_READ_B32_gfx9: 489 case AMDGPU::DS_READ_B64: 490 case AMDGPU::DS_READ_B64_gfx9: 491 return DS_READ; 492 case AMDGPU::DS_WRITE_B32: 493 case AMDGPU::DS_WRITE_B32_gfx9: 494 case AMDGPU::DS_WRITE_B64: 495 case AMDGPU::DS_WRITE_B64_gfx9: 496 return DS_WRITE; 497 case AMDGPU::GLOBAL_LOAD_DWORD: 498 case AMDGPU::GLOBAL_LOAD_DWORDX2: 499 case AMDGPU::GLOBAL_LOAD_DWORDX3: 500 case AMDGPU::GLOBAL_LOAD_DWORDX4: 501 case AMDGPU::FLAT_LOAD_DWORD: 502 case AMDGPU::FLAT_LOAD_DWORDX2: 503 case AMDGPU::FLAT_LOAD_DWORDX3: 504 case AMDGPU::FLAT_LOAD_DWORDX4: 505 return FLAT_LOAD; 506 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 507 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 508 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 509 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 510 return GLOBAL_LOAD_SADDR; 511 case AMDGPU::GLOBAL_STORE_DWORD: 512 case AMDGPU::GLOBAL_STORE_DWORDX2: 513 case AMDGPU::GLOBAL_STORE_DWORDX3: 514 case AMDGPU::GLOBAL_STORE_DWORDX4: 515 case AMDGPU::FLAT_STORE_DWORD: 516 case AMDGPU::FLAT_STORE_DWORDX2: 517 case AMDGPU::FLAT_STORE_DWORDX3: 518 case AMDGPU::FLAT_STORE_DWORDX4: 519 return FLAT_STORE; 520 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 521 case 
AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 522 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 523 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 524 return GLOBAL_STORE_SADDR; 525 } 526 } 527 528 /// Determines instruction subclass from opcode. Only instructions 529 /// of the same subclass can be merged together. The merged instruction may have 530 /// a different subclass but must have the same class. 531 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 532 switch (Opc) { 533 default: 534 if (TII.isMUBUF(Opc)) 535 return AMDGPU::getMUBUFBaseOpcode(Opc); 536 if (TII.isImage(Opc)) { 537 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 538 assert(Info); 539 return Info->BaseOpcode; 540 } 541 if (TII.isMTBUF(Opc)) 542 return AMDGPU::getMTBUFBaseOpcode(Opc); 543 return -1; 544 case AMDGPU::DS_READ_B32: 545 case AMDGPU::DS_READ_B32_gfx9: 546 case AMDGPU::DS_READ_B64: 547 case AMDGPU::DS_READ_B64_gfx9: 548 case AMDGPU::DS_WRITE_B32: 549 case AMDGPU::DS_WRITE_B32_gfx9: 550 case AMDGPU::DS_WRITE_B64: 551 case AMDGPU::DS_WRITE_B64_gfx9: 552 return Opc; 553 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 554 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 555 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 556 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 557 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 558 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 559 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 560 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 561 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 562 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 563 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 564 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM; 565 case AMDGPU::S_LOAD_DWORD_IMM: 566 case AMDGPU::S_LOAD_DWORDX2_IMM: 567 case AMDGPU::S_LOAD_DWORDX3_IMM: 568 case AMDGPU::S_LOAD_DWORDX4_IMM: 569 case AMDGPU::S_LOAD_DWORDX8_IMM: 570 return AMDGPU::S_LOAD_DWORD_IMM; 571 case AMDGPU::GLOBAL_LOAD_DWORD: 572 case AMDGPU::GLOBAL_LOAD_DWORDX2: 573 case AMDGPU::GLOBAL_LOAD_DWORDX3: 574 case AMDGPU::GLOBAL_LOAD_DWORDX4: 575 case AMDGPU::FLAT_LOAD_DWORD: 576 case AMDGPU::FLAT_LOAD_DWORDX2: 577 case AMDGPU::FLAT_LOAD_DWORDX3: 578 case AMDGPU::FLAT_LOAD_DWORDX4: 579 return AMDGPU::FLAT_LOAD_DWORD; 580 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 581 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 582 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 583 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 584 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR; 585 case AMDGPU::GLOBAL_STORE_DWORD: 586 case AMDGPU::GLOBAL_STORE_DWORDX2: 587 case AMDGPU::GLOBAL_STORE_DWORDX3: 588 case AMDGPU::GLOBAL_STORE_DWORDX4: 589 case AMDGPU::FLAT_STORE_DWORD: 590 case AMDGPU::FLAT_STORE_DWORDX2: 591 case AMDGPU::FLAT_STORE_DWORDX3: 592 case AMDGPU::FLAT_STORE_DWORDX4: 593 return AMDGPU::FLAT_STORE_DWORD; 594 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 595 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 596 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 597 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 598 return AMDGPU::GLOBAL_STORE_DWORD_SADDR; 599 } 600 } 601 602 // GLOBAL loads and stores are classified as FLAT initially. If both combined 603 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE. 604 // If either or both instructions are non segment specific FLAT the resulting 605 // combined operation will be FLAT, potentially promoting one of the GLOBAL 606 // operations to FLAT. 607 // For other instructions return the original unmodified class. 
608 InstClassEnum 609 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI, 610 const CombineInfo &Paired) { 611 assert(CI.InstClass == Paired.InstClass); 612 613 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) && 614 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I)) 615 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD; 616 617 return CI.InstClass; 618 } 619 620 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 621 AddressRegs Result; 622 623 if (TII.isMUBUF(Opc)) { 624 if (AMDGPU::getMUBUFHasVAddr(Opc)) 625 Result.VAddr = true; 626 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 627 Result.SRsrc = true; 628 if (AMDGPU::getMUBUFHasSoffset(Opc)) 629 Result.SOffset = true; 630 631 return Result; 632 } 633 634 if (TII.isImage(Opc)) { 635 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 636 if (VAddr0Idx >= 0) { 637 int RsrcName = 638 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc; 639 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName); 640 Result.NumVAddrs = RsrcIdx - VAddr0Idx; 641 } else { 642 Result.VAddr = true; 643 } 644 Result.SRsrc = true; 645 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 646 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 647 Result.SSamp = true; 648 649 return Result; 650 } 651 if (TII.isMTBUF(Opc)) { 652 if (AMDGPU::getMTBUFHasVAddr(Opc)) 653 Result.VAddr = true; 654 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 655 Result.SRsrc = true; 656 if (AMDGPU::getMTBUFHasSoffset(Opc)) 657 Result.SOffset = true; 658 659 return Result; 660 } 661 662 switch (Opc) { 663 default: 664 return Result; 665 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 666 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 667 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: 668 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 669 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 670 Result.SOffset = true; 671 [[fallthrough]]; 672 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 673 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 674 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: 675 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 676 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 677 case AMDGPU::S_LOAD_DWORD_IMM: 678 case AMDGPU::S_LOAD_DWORDX2_IMM: 679 case AMDGPU::S_LOAD_DWORDX3_IMM: 680 case AMDGPU::S_LOAD_DWORDX4_IMM: 681 case AMDGPU::S_LOAD_DWORDX8_IMM: 682 Result.SBase = true; 683 return Result; 684 case AMDGPU::DS_READ_B32: 685 case AMDGPU::DS_READ_B64: 686 case AMDGPU::DS_READ_B32_gfx9: 687 case AMDGPU::DS_READ_B64_gfx9: 688 case AMDGPU::DS_WRITE_B32: 689 case AMDGPU::DS_WRITE_B64: 690 case AMDGPU::DS_WRITE_B32_gfx9: 691 case AMDGPU::DS_WRITE_B64_gfx9: 692 Result.Addr = true; 693 return Result; 694 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 695 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 696 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 697 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 698 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 699 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 700 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 701 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 702 Result.SAddr = true; 703 [[fallthrough]]; 704 case AMDGPU::GLOBAL_LOAD_DWORD: 705 case AMDGPU::GLOBAL_LOAD_DWORDX2: 706 case AMDGPU::GLOBAL_LOAD_DWORDX3: 707 case AMDGPU::GLOBAL_LOAD_DWORDX4: 708 case AMDGPU::GLOBAL_STORE_DWORD: 709 case AMDGPU::GLOBAL_STORE_DWORDX2: 710 case AMDGPU::GLOBAL_STORE_DWORDX3: 711 case AMDGPU::GLOBAL_STORE_DWORDX4: 712 case AMDGPU::FLAT_LOAD_DWORD: 713 case AMDGPU::FLAT_LOAD_DWORDX2: 714 case AMDGPU::FLAT_LOAD_DWORDX3: 715 case 
AMDGPU::FLAT_LOAD_DWORDX4: 716 case AMDGPU::FLAT_STORE_DWORD: 717 case AMDGPU::FLAT_STORE_DWORDX2: 718 case AMDGPU::FLAT_STORE_DWORDX3: 719 case AMDGPU::FLAT_STORE_DWORDX4: 720 Result.VAddr = true; 721 return Result; 722 } 723 } 724 725 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 726 const SILoadStoreOptimizer &LSO) { 727 I = MI; 728 unsigned Opc = MI->getOpcode(); 729 InstClass = getInstClass(Opc, *LSO.TII); 730 731 if (InstClass == UNKNOWN) 732 return; 733 734 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 735 736 switch (InstClass) { 737 case DS_READ: 738 EltSize = 739 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 740 : 4; 741 break; 742 case DS_WRITE: 743 EltSize = 744 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 745 : 4; 746 break; 747 case S_BUFFER_LOAD_IMM: 748 case S_BUFFER_LOAD_SGPR_IMM: 749 case S_LOAD_IMM: 750 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 751 break; 752 default: 753 EltSize = 4; 754 break; 755 } 756 757 if (InstClass == MIMG) { 758 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 759 // Offset is not considered for MIMG instructions. 760 Offset = 0; 761 } else { 762 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 763 Offset = I->getOperand(OffsetIdx).getImm(); 764 } 765 766 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 767 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 768 769 Width = getOpcodeWidth(*I, *LSO.TII); 770 771 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 772 Offset &= 0xffff; 773 } else if (InstClass != MIMG) { 774 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 775 } 776 777 AddressRegs Regs = getRegs(Opc, *LSO.TII); 778 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I); 779 780 NumAddresses = 0; 781 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 782 AddrIdx[NumAddresses++] = 783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 784 if (Regs.Addr) 785 AddrIdx[NumAddresses++] = 786 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 787 if (Regs.SBase) 788 AddrIdx[NumAddresses++] = 789 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 790 if (Regs.SRsrc) 791 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 792 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc); 793 if (Regs.SOffset) 794 AddrIdx[NumAddresses++] = 795 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 796 if (Regs.SAddr) 797 AddrIdx[NumAddresses++] = 798 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 799 if (Regs.VAddr) 800 AddrIdx[NumAddresses++] = 801 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 802 if (Regs.SSamp) 803 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx( 804 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp); 805 assert(NumAddresses <= MaxAddressRegs); 806 807 for (unsigned J = 0; J < NumAddresses; J++) 808 AddrReg[J] = &I->getOperand(AddrIdx[J]); 809 } 810 811 } // end anonymous namespace. 
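
// Illustrative sketch (not upstream code): this pass uses the legacy pass
// manager interface, so a target pipeline can schedule it either through the
// static ID or through the factory declared below. The hook name used here is
// hypothetical and only serves to show the two equivalent forms:
//
//   void addGCNLoadStorePasses(llvm::TargetPassConfig &PC) {
//     PC.addPass(&llvm::SILoadStoreOptimizerID);              // by pass ID
//     // PC.addPass(llvm::createSILoadStoreOptimizerPass());  // by instance
//   }
//
// For testing, llc can stop right after this pass using its registered name,
// e.g. -stop-after=si-load-store-opt (the DEBUG_TYPE string above).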

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL, set the address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) ==
        0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
1001 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 1002 if (EltOffset0 + CI.Width != EltOffset1 && 1003 EltOffset1 + Paired.Width != EltOffset0) 1004 return false; 1005 if (CI.CPol != Paired.CPol) 1006 return false; 1007 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM || 1008 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) { 1009 // Reject cases like: 1010 // dword + dwordx2 -> dwordx3 1011 // dword + dwordx3 -> dwordx4 1012 // If we tried to combine these cases, we would fail to extract a subreg 1013 // for the result of the second load due to SGPR alignment requirements. 1014 if (CI.Width != Paired.Width && 1015 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset)) 1016 return false; 1017 } 1018 return true; 1019 } 1020 1021 // If the offset in elements doesn't fit in 8-bits, we might be able to use 1022 // the stride 64 versions. 1023 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 1024 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 1025 if (Modify) { 1026 CI.Offset = EltOffset0 / 64; 1027 Paired.Offset = EltOffset1 / 64; 1028 CI.UseST64 = true; 1029 } 1030 return true; 1031 } 1032 1033 // Check if the new offsets fit in the reduced 8-bit range. 1034 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 1035 if (Modify) { 1036 CI.Offset = EltOffset0; 1037 Paired.Offset = EltOffset1; 1038 } 1039 return true; 1040 } 1041 1042 // Try to shift base address to decrease offsets. 1043 uint32_t Min = std::min(EltOffset0, EltOffset1); 1044 uint32_t Max = std::max(EltOffset0, EltOffset1); 1045 1046 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 1047 if (((Max - Min) & ~Mask) == 0) { 1048 if (Modify) { 1049 // From the range of values we could use for BaseOff, choose the one that 1050 // is aligned to the highest power of two, to maximise the chance that 1051 // the same offset can be reused for other load/store pairs. 1052 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 1053 // Copy the low bits of the offsets, so that when we adjust them by 1054 // subtracting BaseOff they will be multiples of 64. 1055 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 1056 CI.BaseOff = BaseOff * CI.EltSize; 1057 CI.Offset = (EltOffset0 - BaseOff) / 64; 1058 Paired.Offset = (EltOffset1 - BaseOff) / 64; 1059 CI.UseST64 = true; 1060 } 1061 return true; 1062 } 1063 1064 if (isUInt<8>(Max - Min)) { 1065 if (Modify) { 1066 // From the range of values we could use for BaseOff, choose the one that 1067 // is aligned to the highest power of two, to maximise the chance that 1068 // the same offset can be reused for other load/store pairs. 
1069 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 1070 CI.BaseOff = BaseOff * CI.EltSize; 1071 CI.Offset = EltOffset0 - BaseOff; 1072 Paired.Offset = EltOffset1 - BaseOff; 1073 } 1074 return true; 1075 } 1076 1077 return false; 1078 } 1079 1080 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 1081 const CombineInfo &CI, 1082 const CombineInfo &Paired) { 1083 const unsigned Width = (CI.Width + Paired.Width); 1084 switch (CI.InstClass) { 1085 default: 1086 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 1087 case S_BUFFER_LOAD_IMM: 1088 case S_BUFFER_LOAD_SGPR_IMM: 1089 case S_LOAD_IMM: 1090 switch (Width) { 1091 default: 1092 return false; 1093 case 2: 1094 case 4: 1095 case 8: 1096 return true; 1097 case 3: 1098 return STM.hasScalarDwordx3Loads(); 1099 } 1100 } 1101 } 1102 1103 const TargetRegisterClass * 1104 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 1105 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 1106 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1107 } 1108 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 1109 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1110 } 1111 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 1112 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1113 } 1114 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 1115 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1116 } 1117 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 1118 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1119 } 1120 return nullptr; 1121 } 1122 1123 /// This function assumes that CI comes before Paired in a basic block. Return 1124 /// an insertion point for the merged instruction or nullptr on failure. 1125 SILoadStoreOptimizer::CombineInfo * 1126 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1127 CombineInfo &Paired) { 1128 // If another instruction has already been merged into CI, it may now be a 1129 // type that we can't do any further merging into. 1130 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1131 return nullptr; 1132 assert(CI.InstClass == Paired.InstClass); 1133 1134 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1135 getInstSubclass(Paired.I->getOpcode(), *TII)) 1136 return nullptr; 1137 1138 // Check both offsets (or masks for MIMG) can be combined and fit in the 1139 // reduced range. 1140 if (CI.InstClass == MIMG) { 1141 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1142 return nullptr; 1143 } else { 1144 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1145 return nullptr; 1146 } 1147 1148 DenseSet<Register> RegDefs; 1149 DenseSet<Register> RegUses; 1150 CombineInfo *Where; 1151 if (CI.I->mayLoad()) { 1152 // Try to hoist Paired up to CI. 1153 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1154 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1155 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1156 return nullptr; 1157 } 1158 Where = &CI; 1159 } else { 1160 // Try to sink CI down to Paired. 1161 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1162 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1163 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1164 return nullptr; 1165 } 1166 Where = &Paired; 1167 } 1168 1169 // Call offsetsCanBeCombined with modify = true so that the offsets are 1170 // correct for the new instruction. 
This should return true, because 1171 // this function should only be called on CombineInfo objects that 1172 // have already been confirmed to be mergeable. 1173 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1174 offsetsCanBeCombined(CI, *STM, Paired, true); 1175 return Where; 1176 } 1177 1178 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1179 if (STM->ldsRequiresM0Init()) 1180 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1181 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1182 } 1183 1184 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1185 if (STM->ldsRequiresM0Init()) 1186 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1187 1188 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1189 : AMDGPU::DS_READ2ST64_B64_gfx9; 1190 } 1191 1192 MachineBasicBlock::iterator 1193 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1194 MachineBasicBlock::iterator InsertBefore) { 1195 MachineBasicBlock *MBB = CI.I->getParent(); 1196 1197 // Be careful, since the addresses could be subregisters themselves in weird 1198 // cases, like vectors of pointers. 1199 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1200 1201 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1202 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1203 1204 unsigned NewOffset0 = CI.Offset; 1205 unsigned NewOffset1 = Paired.Offset; 1206 unsigned Opc = 1207 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1208 1209 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1210 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1211 1212 if (NewOffset0 > NewOffset1) { 1213 // Canonicalize the merged instruction so the smaller offset comes first. 1214 std::swap(NewOffset0, NewOffset1); 1215 std::swap(SubRegIdx0, SubRegIdx1); 1216 } 1217 1218 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1219 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1220 1221 const MCInstrDesc &Read2Desc = TII->get(Opc); 1222 1223 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1224 Register DestReg = MRI->createVirtualRegister(SuperRC); 1225 1226 DebugLoc DL = CI.I->getDebugLoc(); 1227 1228 Register BaseReg = AddrReg->getReg(); 1229 unsigned BaseSubReg = AddrReg->getSubReg(); 1230 unsigned BaseRegFlags = 0; 1231 if (CI.BaseOff) { 1232 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1233 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1234 .addImm(CI.BaseOff); 1235 1236 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1237 BaseRegFlags = RegState::Kill; 1238 1239 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1240 .addReg(ImmReg) 1241 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1242 .addImm(0); // clamp bit 1243 BaseSubReg = 0; 1244 } 1245 1246 MachineInstrBuilder Read2 = 1247 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1248 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1249 .addImm(NewOffset0) // offset0 1250 .addImm(NewOffset1) // offset1 1251 .addImm(0) // gds 1252 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1253 1254 (void)Read2; 1255 1256 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1257 1258 // Copy to the old destination registers. 
1259 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1260 .add(*Dest0) // Copy to same destination including flags and sub reg. 1261 .addReg(DestReg, 0, SubRegIdx0); 1262 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1263 .add(*Dest1) 1264 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1265 1266 CI.I->eraseFromParent(); 1267 Paired.I->eraseFromParent(); 1268 1269 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1270 return Read2; 1271 } 1272 1273 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1274 if (STM->ldsRequiresM0Init()) 1275 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1276 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1277 : AMDGPU::DS_WRITE2_B64_gfx9; 1278 } 1279 1280 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1281 if (STM->ldsRequiresM0Init()) 1282 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1283 : AMDGPU::DS_WRITE2ST64_B64; 1284 1285 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1286 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1287 } 1288 1289 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1290 CombineInfo &CI, CombineInfo &Paired, 1291 MachineBasicBlock::iterator InsertBefore) { 1292 MachineBasicBlock *MBB = CI.I->getParent(); 1293 1294 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1295 // sure we preserve the subregister index and any register flags set on them. 1296 const MachineOperand *AddrReg = 1297 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1298 const MachineOperand *Data0 = 1299 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1300 const MachineOperand *Data1 = 1301 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1302 1303 unsigned NewOffset0 = CI.Offset; 1304 unsigned NewOffset1 = Paired.Offset; 1305 unsigned Opc = 1306 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1307 1308 if (NewOffset0 > NewOffset1) { 1309 // Canonicalize the merged instruction so the smaller offset comes first. 
1310 std::swap(NewOffset0, NewOffset1); 1311 std::swap(Data0, Data1); 1312 } 1313 1314 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1315 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1316 1317 const MCInstrDesc &Write2Desc = TII->get(Opc); 1318 DebugLoc DL = CI.I->getDebugLoc(); 1319 1320 Register BaseReg = AddrReg->getReg(); 1321 unsigned BaseSubReg = AddrReg->getSubReg(); 1322 unsigned BaseRegFlags = 0; 1323 if (CI.BaseOff) { 1324 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1325 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1326 .addImm(CI.BaseOff); 1327 1328 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1329 BaseRegFlags = RegState::Kill; 1330 1331 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1332 .addReg(ImmReg) 1333 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1334 .addImm(0); // clamp bit 1335 BaseSubReg = 0; 1336 } 1337 1338 MachineInstrBuilder Write2 = 1339 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1340 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1341 .add(*Data0) // data0 1342 .add(*Data1) // data1 1343 .addImm(NewOffset0) // offset0 1344 .addImm(NewOffset1) // offset1 1345 .addImm(0) // gds 1346 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1347 1348 CI.I->eraseFromParent(); 1349 Paired.I->eraseFromParent(); 1350 1351 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1352 return Write2; 1353 } 1354 1355 MachineBasicBlock::iterator 1356 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1357 MachineBasicBlock::iterator InsertBefore) { 1358 MachineBasicBlock *MBB = CI.I->getParent(); 1359 DebugLoc DL = CI.I->getDebugLoc(); 1360 const unsigned Opcode = getNewOpcode(CI, Paired); 1361 1362 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1363 1364 Register DestReg = MRI->createVirtualRegister(SuperRC); 1365 unsigned MergedDMask = CI.DMask | Paired.DMask; 1366 unsigned DMaskIdx = 1367 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1368 1369 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1370 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1371 if (I == DMaskIdx) 1372 MIB.addImm(MergedDMask); 1373 else 1374 MIB.add((*CI.I).getOperand(I)); 1375 } 1376 1377 // It shouldn't be possible to get this far if the two instructions 1378 // don't have a single memoperand, because MachineInstr::mayAlias() 1379 // will return true if this is the case. 1380 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1381 1382 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1383 1384 unsigned SubRegIdx0, SubRegIdx1; 1385 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1386 1387 // Copy to the old destination registers. 1388 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1389 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1390 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1391 1392 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1393 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1394 .addReg(DestReg, 0, SubRegIdx0); 1395 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1396 .add(*Dest1) 1397 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1398 1399 CI.I->eraseFromParent(); 1400 Paired.I->eraseFromParent(); 1401 return New; 1402 } 1403 1404 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1405 CombineInfo &CI, CombineInfo &Paired, 1406 MachineBasicBlock::iterator InsertBefore) { 1407 MachineBasicBlock *MBB = CI.I->getParent(); 1408 DebugLoc DL = CI.I->getDebugLoc(); 1409 const unsigned Opcode = getNewOpcode(CI, Paired); 1410 1411 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1412 1413 Register DestReg = MRI->createVirtualRegister(SuperRC); 1414 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1415 1416 // It shouldn't be possible to get this far if the two instructions 1417 // don't have a single memoperand, because MachineInstr::mayAlias() 1418 // will return true if this is the case. 1419 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1420 1421 MachineInstrBuilder New = 1422 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1423 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1424 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1425 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1426 New.addImm(MergedOffset); 1427 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1428 1429 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1430 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1431 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1432 1433 // Copy to the old destination registers. 1434 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1435 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1436 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1437 1438 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1439 .add(*Dest0) // Copy to same destination including flags and sub reg. 1440 .addReg(DestReg, 0, SubRegIdx0); 1441 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1442 .add(*Dest1) 1443 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1444 1445 CI.I->eraseFromParent(); 1446 Paired.I->eraseFromParent(); 1447 return New; 1448 } 1449 1450 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1451 CombineInfo &CI, CombineInfo &Paired, 1452 MachineBasicBlock::iterator InsertBefore) { 1453 MachineBasicBlock *MBB = CI.I->getParent(); 1454 DebugLoc DL = CI.I->getDebugLoc(); 1455 1456 const unsigned Opcode = getNewOpcode(CI, Paired); 1457 1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1459 1460 // Copy to the new source register. 1461 Register DestReg = MRI->createVirtualRegister(SuperRC); 1462 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1463 1464 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1465 1466 AddressRegs Regs = getRegs(Opcode, *TII); 1467 1468 if (Regs.VAddr) 1469 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1470 1471 // It shouldn't be possible to get this far if the two instructions 1472 // don't have a single memoperand, because MachineInstr::mayAlias() 1473 // will return true if this is the case. 
1474 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1475 1476 MachineInstr *New = 1477 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1478 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1479 .addImm(MergedOffset) // offset 1480 .addImm(CI.CPol) // cpol 1481 .addImm(0) // swz 1482 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1483 1484 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1485 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1486 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1487 1488 // Copy to the old destination registers. 1489 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1490 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1491 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1492 1493 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1494 .add(*Dest0) // Copy to same destination including flags and sub reg. 1495 .addReg(DestReg, 0, SubRegIdx0); 1496 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1497 .add(*Dest1) 1498 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1499 1500 CI.I->eraseFromParent(); 1501 Paired.I->eraseFromParent(); 1502 return New; 1503 } 1504 1505 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1506 CombineInfo &CI, CombineInfo &Paired, 1507 MachineBasicBlock::iterator InsertBefore) { 1508 MachineBasicBlock *MBB = CI.I->getParent(); 1509 DebugLoc DL = CI.I->getDebugLoc(); 1510 1511 const unsigned Opcode = getNewOpcode(CI, Paired); 1512 1513 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1514 1515 // Copy to the new source register. 1516 Register DestReg = MRI->createVirtualRegister(SuperRC); 1517 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1518 1519 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1520 1521 AddressRegs Regs = getRegs(Opcode, *TII); 1522 1523 if (Regs.VAddr) 1524 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1525 1526 unsigned JoinedFormat = 1527 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1528 1529 // It shouldn't be possible to get this far if the two instructions 1530 // don't have a single memoperand, because MachineInstr::mayAlias() 1531 // will return true if this is the case. 1532 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1533 1534 MachineInstr *New = 1535 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1536 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1537 .addImm(MergedOffset) // offset 1538 .addImm(JoinedFormat) // format 1539 .addImm(CI.CPol) // cpol 1540 .addImm(0) // swz 1541 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1542 1543 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1544 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1545 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1546 1547 // Copy to the old destination registers. 1548 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1549 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1550 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1551 1552 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1553 .add(*Dest0) // Copy to same destination including flags and sub reg. 
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::pair(Idx0, Idx1);
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
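// The emitted sequence is, roughly:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32         = V_ADDC_U32_e64   Addr.Base.HiReg, OffsetHi, %carry, 0
//   %base:vreg_64       = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// where OffsetLo/OffsetHi are the two 32-bit halves of Addr.Offset,
// materialized by createRegOrImm as either an inline-constant immediate or an
// S_MOV_B32 into an SGPR.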
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
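// Only the vaddr and offset operands are rewritten in place; the old base
// computation is not removed here. E.g. updateBaseAndOffset(MI, %newbase, -4096)
// turns
//   GLOBAL_LOAD_DWORDX2 %oldbase, 0, ...
// into
//   GLOBAL_LOAD_DWORDX2 %newbase, -4096, ...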
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// The expected base computation is:
//  %OFFSET0:sgpr_32 = S_MOV_B32 8000
//  %LO:vgpr_32, %c:sreg_64_xexec =
//      V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//  %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//  %Base:vreg_64 =
//      REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
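  // In effect this only accepts GLOBAL_* instructions that have a SADDR
  // variant; opcodes without one (including FLAT and SCRATCH) fail the check
  // below and are skipped.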
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) with the highest 13-bit distance from MI's
  // offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate addresses.
  //
  // Step 3: Move (&a + 8192) above load1, then compute and promote offsets
  // relative to (&a + 8192) for load1, load2, and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,  0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for (; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes differ.
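    // Only instructions with the same opcode and a still-unpromoted (zero)
    // immediate offset are considered as anchor candidates.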
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
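    // Note that Visited caches the base/offset decomposition per instruction
    // and AnchorList records instructions already used as anchors, so repeated
    // scans of the same block do not redo the analysis or re-promote anchors.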
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. The search can resume after such a barrier for separate
    // merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction the verifier
      //        will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort the lists by offset and then, for each CombineInfo object in
  // a list, try to find an instruction that can be merged with it. If one is
  // found, it is stored in the Paired field. If no instruction is found, the
  // CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset so that mergeable instructions end up adjacent
    // to each other in the list, which makes it easier to find matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
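//
// optimizeBlock runs one merging pass over each per-base-address list produced
// by collectMergeableInsts. Lists that yield no merge, or that cannot be
// merged any further, are erased; if a list may still have work left,
// OptimizeAgain is set so the caller re-runs this over the surviving lists.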
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

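// Pass entry point: for each basic block, repeatedly (a) collect the mergeable
// instructions in the section up to the next barrier while opportunistically
// promoting constant offsets, then (b) merge within each per-base-address list
// until no further progress is made, before moving on to the next section.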
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}