//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions
// that allows it to have a 13-bit constant offset and then promotes the
// 13-bit offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "SILoadStoreOptimizer.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD,  // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE  // any CombineInfo, they are only ever returned by
                // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge instructions with other physical
        // reg addresses too.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask
                                 : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore,
                      AMDGPU::OpName OpName, Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           AMDGPU::OpName OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool
  promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
                             SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(
      const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

public:
  SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
  bool run(MachineFunction &MF);
};

class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
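/// All addressing-mode encodings of one logical operation collapse to a
/// single class; e.g. every BUFFER_LOAD_DWORD addressing variant (OFFEN,
/// IDXEN, BOTHEN, OFFSET, and their _exact forms) maps to BUFFER_LOAD.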
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions of the same
/// subclass can be merged together. The merged instruction may have a
/// different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT. For other instructions return the original
// unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      AMDGPU::OpName RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;

FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
  return new SILoadStoreOptimizerLegacy();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  AMDGPU::OpName OperandsToMatch[] = {
      AMDGPU::OpName::cpol, AMDGPU::OpName::d16,  AMDGPU::OpName::unorm,
      AMDGPU::OpName::da,   AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (AMDGPU::OpName op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
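  // The two dmasks must not interleave: every set bit of the smaller mask
  // must lie below the lowest set bit of the larger mask. E.g. dmasks 0b0011
  // and 0b1100 can merge into 0b1111, but 0b0101 and 0b1010 are rejected
  // even though they are disjoint.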
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if (!MaxMask)
    return false;

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
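  // For these classes the two accesses must be exactly adjacent in element
  // units (in either order); e.g. a dwordx2 at element offset 4 pairs with a
  // dword at element offset 6, since 4 + 2 == 6.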
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a
      // subreg for the result of the second load due to SGPR alignment
      // requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
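      // E.g. for element offsets Min = 211 and Max = 291, any base in
      // [Max - 255, Min] = [36, 211] makes both offsets fit in 8 bits;
      // mostAlignedValueInRange picks 128.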
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    case 3:
      return STM.hasScalarDwordx3Loads();
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check that both offsets (or masks for MIMG) can be combined and fit in
  // the reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with Modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, AMDGPU::OpName OpName,
    Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have the
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // MOs in copies.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);
}

// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      AMDGPU::OpName OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  return SrcReg;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9
                        : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
                          : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
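  // When offsetsCanBeCombined chose a new base (CI.BaseOff != 0), it is added
  // to the original address below so the reduced offset0/offset1 stay within
  // the 8-bit range.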
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Allocate the destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the destination register for the merged load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

static bool needsConstrainedOpcode(const GCNSubtarget &STM,
                                   ArrayRef<MachineMemOperand *> MMOs,
                                   unsigned Width) {
  // Conservatively returns true if the MMO is not found.
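  // With XNACK enabled, a merged access must use the constrained (_ec)
  // opcode variant unless a single memoperand proves that the access is
  // aligned to its full width (Width * 4 bytes).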
  return STM.isXNACKEnabled() &&
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  }
  case S_BUFFER_LOAD_SGPR_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  }
  case S_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
    switch (Width) {
    default:
      return 0;
    case 2:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return {Idx0, Idx1};
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register SrcReg =
      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
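// As an illustrative sketch (the register names here are placeholders, not
// real pass output), the sequence built below has this shape:
//   %offlo:sgpr_32 = S_MOV_B32 <Offset & 0xffffffff>   ; unless inline const
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %Base.LoReg, %offlo, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 %Base.HiReg, <Offset >> 32>, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1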
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
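// The kill flag on the rewritten base operand is cleared because the caller
// may install the same NewBase register into several instructions.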
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyzes Base and extracts:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  if (!BaseLo.isReg())
    return;

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  if (!BaseHi.isReg())
    return;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
    return false;

  // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
  if (SIInstrInfo::isFLATScratch(MI))
    return false;

  unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
                                              : AMDGPUAS::FLAT_ADDRESS;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  auto [It, Inserted] = Visited.try_emplace(&MI);
  MemAddress MAddr;
  if (Inserted) {
    processBaseWithConstOffset(Base, MAddr);
    It->second = MAddr;
  } else
    MAddr = It->second;

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
                    << printReg(MAddr.Base.LoReg, TRI)
                    << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) at the highest 13-bit distance
  // from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and
  // &a + 8192 have a 13-bit distance from &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor), because the maximum distance can
  // presumably accommodate more intermediate bases.
  //
  // Step3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
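    // Every instruction sharing MI's base is recorded in InstsWCommonBase;
    // among them, the anchor is the one whose offset is farthest from MI's
    // offset while the distance still fits a legal immediate.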
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    auto [It, Inserted] = Visited.try_emplace(&MINext);
    if (Inserted) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      It->second = MAddrNext;
    } else
      MAddrNext = It->second;

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalFlatAddressingMode(AM, AS) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up the anchor, just recompute its base address at MI.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

      if (TLI->isLegalFlatAddressingMode(AM, AS)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
                   OtherMI->dump());
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; OtherMI->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look for separate merges after this barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction, the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no
  // instructions are found, then the CombineInfo object is deleted from the
  // list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way, mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return {BlockI, Modified};
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
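//
// For illustration (a sketch of the intended effect, not verbatim output):
//   s_load_dword s4, s[0:1], 0x0
//   s_load_dword s5, s[0:1], 0x4
//   s_load_dword s6, s[0:1], 0x8
//   s_load_dword s7, s[0:1], 0xc
// can be combined, possibly over several OptimizeAgain iterations, into
//   s_load_dwordx4 s[4:7], s[0:1], 0x0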
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())
      .run(MF);
}

bool SILoadStoreOptimizer::run(MachineFunction &MF) {
  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}

PreservedAnalyses
SILoadStoreOptimizerPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);

  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
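// A typical way to exercise this pass in isolation (assuming an amdgcn build
// of llc and an existing MIR test file; the file name here is hypothetical):
//   llc -mtriple=amdgcn -run-pass=si-load-store-opt -verify-machineinstrs \
//       -o - test.mir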