//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit in 8 bits after adjusting
//   the base, we can add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo& Other) const {
      return (InstClass == MIMG) ?
DMask < Other.DMask : Offset < Other.Offset; 181 } 182 }; 183 184 struct BaseRegisters { 185 Register LoReg; 186 Register HiReg; 187 188 unsigned LoSubReg = 0; 189 unsigned HiSubReg = 0; 190 }; 191 192 struct MemAddress { 193 BaseRegisters Base; 194 int64_t Offset = 0; 195 }; 196 197 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 198 199 private: 200 const GCNSubtarget *STM = nullptr; 201 const SIInstrInfo *TII = nullptr; 202 const SIRegisterInfo *TRI = nullptr; 203 MachineRegisterInfo *MRI = nullptr; 204 AliasAnalysis *AA = nullptr; 205 bool OptimizeAgain; 206 207 bool canSwapInstructions(const DenseSet<Register> &ARegDefs, 208 const DenseSet<Register> &ARegUses, 209 const MachineInstr &A, const MachineInstr &B) const; 210 static bool dmasksCanBeCombined(const CombineInfo &CI, 211 const SIInstrInfo &TII, 212 const CombineInfo &Paired); 213 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 214 CombineInfo &Paired, bool Modify = false); 215 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 216 const CombineInfo &Paired); 217 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 218 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 219 const CombineInfo &Paired); 220 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 223 224 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired); 225 226 unsigned read2Opcode(unsigned EltSize) const; 227 unsigned read2ST64Opcode(unsigned EltSize) const; 228 MachineBasicBlock::iterator 229 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 230 MachineBasicBlock::iterator InsertBefore); 231 232 unsigned write2Opcode(unsigned EltSize) const; 233 unsigned write2ST64Opcode(unsigned EltSize) const; 234 MachineBasicBlock::iterator 235 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 236 MachineBasicBlock::iterator InsertBefore); 237 MachineBasicBlock::iterator 238 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 239 MachineBasicBlock::iterator InsertBefore); 240 MachineBasicBlock::iterator 241 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 242 MachineBasicBlock::iterator InsertBefore); 243 MachineBasicBlock::iterator 244 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 245 MachineBasicBlock::iterator InsertBefore); 246 MachineBasicBlock::iterator 247 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 248 MachineBasicBlock::iterator InsertBefore); 249 MachineBasicBlock::iterator 250 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 251 MachineBasicBlock::iterator InsertBefore); 252 MachineBasicBlock::iterator 253 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 254 MachineBasicBlock::iterator InsertBefore); 255 MachineBasicBlock::iterator 256 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired, 257 MachineBasicBlock::iterator InsertBefore); 258 MachineBasicBlock::iterator 259 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired, 260 MachineBasicBlock::iterator InsertBefore); 261 262 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 263 int32_t NewOffset) const; 264 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 265 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 266 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 267 void processBaseWithConstOffset(const MachineOperand 
&Base, MemAddress &Addr) const; 268 /// Promotes constant offset to the immediate by adjusting the base. It 269 /// tries to use a base from the nearby instructions that allows it to have 270 /// a 13bit constant offset which gets promoted to the immediate. 271 bool promoteConstantOffsetToImm(MachineInstr &CI, 272 MemInfoMap &Visited, 273 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 274 void addInstToMergeableList(const CombineInfo &CI, 275 std::list<std::list<CombineInfo> > &MergeableInsts) const; 276 277 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 278 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 279 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 280 std::list<std::list<CombineInfo>> &MergeableInsts) const; 281 282 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI, 283 const CombineInfo &Paired); 284 285 static InstClassEnum getCommonInstClass(const CombineInfo &CI, 286 const CombineInfo &Paired); 287 288 public: 289 static char ID; 290 291 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 292 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 293 } 294 295 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 296 bool &OptimizeListAgain); 297 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 298 299 bool runOnMachineFunction(MachineFunction &MF) override; 300 301 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 302 303 void getAnalysisUsage(AnalysisUsage &AU) const override { 304 AU.setPreservesCFG(); 305 AU.addRequired<AAResultsWrapperPass>(); 306 307 MachineFunctionPass::getAnalysisUsage(AU); 308 } 309 310 MachineFunctionProperties getRequiredProperties() const override { 311 return MachineFunctionProperties() 312 .set(MachineFunctionProperties::Property::IsSSA); 313 } 314 }; 315 316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 317 const unsigned Opc = MI.getOpcode(); 318 319 if (TII.isMUBUF(Opc)) { 320 // FIXME: Handle d16 correctly 321 return AMDGPU::getMUBUFElements(Opc); 322 } 323 if (TII.isMIMG(MI)) { 324 uint64_t DMaskImm = 325 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 326 return llvm::popcount(DMaskImm); 327 } 328 if (TII.isMTBUF(Opc)) { 329 return AMDGPU::getMTBUFElements(Opc); 330 } 331 332 switch (Opc) { 333 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 334 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 335 case AMDGPU::S_LOAD_DWORD_IMM: 336 case AMDGPU::GLOBAL_LOAD_DWORD: 337 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 338 case AMDGPU::GLOBAL_STORE_DWORD: 339 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 340 case AMDGPU::FLAT_LOAD_DWORD: 341 case AMDGPU::FLAT_STORE_DWORD: 342 return 1; 343 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 344 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 345 case AMDGPU::S_LOAD_DWORDX2_IMM: 346 case AMDGPU::GLOBAL_LOAD_DWORDX2: 347 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 348 case AMDGPU::GLOBAL_STORE_DWORDX2: 349 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 350 case AMDGPU::FLAT_LOAD_DWORDX2: 351 case AMDGPU::FLAT_STORE_DWORDX2: 352 return 2; 353 case AMDGPU::GLOBAL_LOAD_DWORDX3: 354 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 355 case AMDGPU::GLOBAL_STORE_DWORDX3: 356 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 357 case AMDGPU::FLAT_LOAD_DWORDX3: 358 case AMDGPU::FLAT_STORE_DWORDX3: 359 return 3; 360 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 361 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 362 case AMDGPU::S_LOAD_DWORDX4_IMM: 363 case AMDGPU::GLOBAL_LOAD_DWORDX4: 
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
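      // Only plain image loads are merged here; image stores and atomics
      // (mayStore) and gather4 results do not fit the dmask-based merging
      // this pass performs, so the check below rejects them.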
418 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 419 TII.isGather4(Opc)) 420 return UNKNOWN; 421 return MIMG; 422 } 423 if (TII.isMTBUF(Opc)) { 424 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 425 default: 426 return UNKNOWN; 427 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 428 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 429 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 430 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 431 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: 432 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: 433 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: 434 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: 435 return TBUFFER_LOAD; 436 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 437 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 438 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 439 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 440 return TBUFFER_STORE; 441 } 442 } 443 return UNKNOWN; 444 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 445 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 446 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 447 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 448 return S_BUFFER_LOAD_IMM; 449 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 450 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 451 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 452 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 453 return S_BUFFER_LOAD_SGPR_IMM; 454 case AMDGPU::S_LOAD_DWORD_IMM: 455 case AMDGPU::S_LOAD_DWORDX2_IMM: 456 case AMDGPU::S_LOAD_DWORDX4_IMM: 457 case AMDGPU::S_LOAD_DWORDX8_IMM: 458 return S_LOAD_IMM; 459 case AMDGPU::DS_READ_B32: 460 case AMDGPU::DS_READ_B32_gfx9: 461 case AMDGPU::DS_READ_B64: 462 case AMDGPU::DS_READ_B64_gfx9: 463 return DS_READ; 464 case AMDGPU::DS_WRITE_B32: 465 case AMDGPU::DS_WRITE_B32_gfx9: 466 case AMDGPU::DS_WRITE_B64: 467 case AMDGPU::DS_WRITE_B64_gfx9: 468 return DS_WRITE; 469 case AMDGPU::GLOBAL_LOAD_DWORD: 470 case AMDGPU::GLOBAL_LOAD_DWORDX2: 471 case AMDGPU::GLOBAL_LOAD_DWORDX3: 472 case AMDGPU::GLOBAL_LOAD_DWORDX4: 473 case AMDGPU::FLAT_LOAD_DWORD: 474 case AMDGPU::FLAT_LOAD_DWORDX2: 475 case AMDGPU::FLAT_LOAD_DWORDX3: 476 case AMDGPU::FLAT_LOAD_DWORDX4: 477 return FLAT_LOAD; 478 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 479 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 480 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 481 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 482 return GLOBAL_LOAD_SADDR; 483 case AMDGPU::GLOBAL_STORE_DWORD: 484 case AMDGPU::GLOBAL_STORE_DWORDX2: 485 case AMDGPU::GLOBAL_STORE_DWORDX3: 486 case AMDGPU::GLOBAL_STORE_DWORDX4: 487 case AMDGPU::FLAT_STORE_DWORD: 488 case AMDGPU::FLAT_STORE_DWORDX2: 489 case AMDGPU::FLAT_STORE_DWORDX3: 490 case AMDGPU::FLAT_STORE_DWORDX4: 491 return FLAT_STORE; 492 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 493 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 494 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 495 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 496 return GLOBAL_STORE_SADDR; 497 } 498 } 499 500 /// Determines instruction subclass from opcode. Only instructions 501 /// of the same subclass can be merged together. The merged instruction may have 502 /// a different subclass but must have the same class. 
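/// For example, two S_LOAD_DWORDX2_IMM loads both map to the S_LOAD_DWORD_IMM
/// subclass and can be merged into a single S_LOAD_DWORDX4_IMM, which has a
/// different subclass but is still in the S_LOAD_IMM class.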
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions, return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ?
GLOBAL_STORE : GLOBAL_LOAD; 585 586 return CI.InstClass; 587 } 588 589 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 590 AddressRegs Result; 591 592 if (TII.isMUBUF(Opc)) { 593 if (AMDGPU::getMUBUFHasVAddr(Opc)) 594 Result.VAddr = true; 595 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 596 Result.SRsrc = true; 597 if (AMDGPU::getMUBUFHasSoffset(Opc)) 598 Result.SOffset = true; 599 600 return Result; 601 } 602 603 if (TII.isMIMG(Opc)) { 604 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 605 if (VAddr0Idx >= 0) { 606 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 607 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 608 } else { 609 Result.VAddr = true; 610 } 611 Result.SRsrc = true; 612 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 613 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 614 Result.SSamp = true; 615 616 return Result; 617 } 618 if (TII.isMTBUF(Opc)) { 619 if (AMDGPU::getMTBUFHasVAddr(Opc)) 620 Result.VAddr = true; 621 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 622 Result.SRsrc = true; 623 if (AMDGPU::getMTBUFHasSoffset(Opc)) 624 Result.SOffset = true; 625 626 return Result; 627 } 628 629 switch (Opc) { 630 default: 631 return Result; 632 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: 633 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: 634 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: 635 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: 636 Result.SOffset = true; 637 [[fallthrough]]; 638 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 639 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 640 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 641 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 642 case AMDGPU::S_LOAD_DWORD_IMM: 643 case AMDGPU::S_LOAD_DWORDX2_IMM: 644 case AMDGPU::S_LOAD_DWORDX4_IMM: 645 case AMDGPU::S_LOAD_DWORDX8_IMM: 646 Result.SBase = true; 647 return Result; 648 case AMDGPU::DS_READ_B32: 649 case AMDGPU::DS_READ_B64: 650 case AMDGPU::DS_READ_B32_gfx9: 651 case AMDGPU::DS_READ_B64_gfx9: 652 case AMDGPU::DS_WRITE_B32: 653 case AMDGPU::DS_WRITE_B64: 654 case AMDGPU::DS_WRITE_B32_gfx9: 655 case AMDGPU::DS_WRITE_B64_gfx9: 656 Result.Addr = true; 657 return Result; 658 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: 659 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: 660 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: 661 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: 662 case AMDGPU::GLOBAL_STORE_DWORD_SADDR: 663 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR: 664 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR: 665 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR: 666 Result.SAddr = true; 667 [[fallthrough]]; 668 case AMDGPU::GLOBAL_LOAD_DWORD: 669 case AMDGPU::GLOBAL_LOAD_DWORDX2: 670 case AMDGPU::GLOBAL_LOAD_DWORDX3: 671 case AMDGPU::GLOBAL_LOAD_DWORDX4: 672 case AMDGPU::GLOBAL_STORE_DWORD: 673 case AMDGPU::GLOBAL_STORE_DWORDX2: 674 case AMDGPU::GLOBAL_STORE_DWORDX3: 675 case AMDGPU::GLOBAL_STORE_DWORDX4: 676 case AMDGPU::FLAT_LOAD_DWORD: 677 case AMDGPU::FLAT_LOAD_DWORDX2: 678 case AMDGPU::FLAT_LOAD_DWORDX3: 679 case AMDGPU::FLAT_LOAD_DWORDX4: 680 case AMDGPU::FLAT_STORE_DWORD: 681 case AMDGPU::FLAT_STORE_DWORDX2: 682 case AMDGPU::FLAT_STORE_DWORDX3: 683 case AMDGPU::FLAT_STORE_DWORDX4: 684 Result.VAddr = true; 685 return Result; 686 } 687 } 688 689 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 690 const SILoadStoreOptimizer &LSO) { 691 I = MI; 692 unsigned Opc = MI->getOpcode(); 693 InstClass = getInstClass(Opc, *LSO.TII); 694 695 if (InstClass == UNKNOWN) 696 return; 697 698 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI)); 699 700 switch (InstClass) { 
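  // Note: EltSize is the granularity the offset operand is measured in. For
  // DS it is the element size in bytes (4 or 8); for the scalar load classes
  // it is the size of one SMEM immediate-offset unit (convertSMRDOffsetUnits
  // folds in whether the subtarget encodes SMEM offsets in bytes or dwords);
  // otherwise it is one dword.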
701 case DS_READ: 702 EltSize = 703 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 704 : 4; 705 break; 706 case DS_WRITE: 707 EltSize = 708 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 709 : 4; 710 break; 711 case S_BUFFER_LOAD_IMM: 712 case S_BUFFER_LOAD_SGPR_IMM: 713 case S_LOAD_IMM: 714 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 715 break; 716 default: 717 EltSize = 4; 718 break; 719 } 720 721 if (InstClass == MIMG) { 722 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 723 // Offset is not considered for MIMG instructions. 724 Offset = 0; 725 } else { 726 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 727 Offset = I->getOperand(OffsetIdx).getImm(); 728 } 729 730 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 731 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 732 733 Width = getOpcodeWidth(*I, *LSO.TII); 734 735 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 736 Offset &= 0xffff; 737 } else if (InstClass != MIMG) { 738 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 739 } 740 741 AddressRegs Regs = getRegs(Opc, *LSO.TII); 742 743 NumAddresses = 0; 744 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 745 AddrIdx[NumAddresses++] = 746 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 747 if (Regs.Addr) 748 AddrIdx[NumAddresses++] = 749 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 750 if (Regs.SBase) 751 AddrIdx[NumAddresses++] = 752 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 753 if (Regs.SRsrc) 754 AddrIdx[NumAddresses++] = 755 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 756 if (Regs.SOffset) 757 AddrIdx[NumAddresses++] = 758 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 759 if (Regs.SAddr) 760 AddrIdx[NumAddresses++] = 761 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr); 762 if (Regs.VAddr) 763 AddrIdx[NumAddresses++] = 764 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 765 if (Regs.SSamp) 766 AddrIdx[NumAddresses++] = 767 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 768 assert(NumAddresses <= MaxAddressRegs); 769 770 for (unsigned J = 0; J < NumAddresses; J++) 771 AddrReg[J] = &I->getOperand(AddrIdx[J]); 772 } 773 774 } // end anonymous namespace. 
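// For reference, the pass is registered under the name given by DEBUG_TYPE,
// so an isolated MIR reproducer can typically exercise it with something like
//   llc -mtriple=amdgcn -run-pass=si-load-store-opt -o - <input.mir>
// (sketched here as a convenience; adjust the invocation to the local setup).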
775 776 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 777 "SI Load Store Optimizer", false, false) 778 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 779 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 780 false, false) 781 782 char SILoadStoreOptimizer::ID = 0; 783 784 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 785 786 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 787 return new SILoadStoreOptimizer(); 788 } 789 790 static void addDefsUsesToList(const MachineInstr &MI, 791 DenseSet<Register> &RegDefs, 792 DenseSet<Register> &RegUses) { 793 for (const auto &Op : MI.operands()) { 794 if (!Op.isReg()) 795 continue; 796 if (Op.isDef()) 797 RegDefs.insert(Op.getReg()); 798 if (Op.readsReg()) 799 RegUses.insert(Op.getReg()); 800 } 801 } 802 803 bool SILoadStoreOptimizer::canSwapInstructions( 804 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses, 805 const MachineInstr &A, const MachineInstr &B) const { 806 if (A.mayLoadOrStore() && B.mayLoadOrStore() && 807 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true)) 808 return false; 809 for (const auto &BOp : B.operands()) { 810 if (!BOp.isReg()) 811 continue; 812 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg())) 813 return false; 814 if (BOp.isDef() && ARegUses.contains(BOp.getReg())) 815 return false; 816 } 817 return true; 818 } 819 820 // Given that \p CI and \p Paired are adjacent memory operations produce a new 821 // MMO for the combined operation with a new access size. 822 MachineMemOperand * 823 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI, 824 const CombineInfo &Paired) { 825 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 826 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 827 828 unsigned Size = MMOa->getSize() + MMOb->getSize(); 829 830 // A base pointer for the combined operation is the same as the leading 831 // operation's pointer. 832 if (Paired < CI) 833 std::swap(MMOa, MMOb); 834 835 MachinePointerInfo PtrInfo(MMOa->getPointerInfo()); 836 // If merging FLAT and GLOBAL set address space to FLAT. 837 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) 838 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS; 839 840 MachineFunction *MF = CI.I->getMF(); 841 return MF->getMachineMemOperand(MMOa, PtrInfo, Size); 842 } 843 844 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 845 const SIInstrInfo &TII, 846 const CombineInfo &Paired) { 847 assert(CI.InstClass == MIMG); 848 849 // Ignore instructions with tfe/lwe set. 850 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 851 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 852 853 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 854 return false; 855 856 // Check other optional immediate operands for equality. 857 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 858 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 859 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 860 861 for (auto op : OperandsToMatch) { 862 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 863 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 864 return false; 865 if (Idx != -1 && 866 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 867 return false; 868 } 869 870 // Check DMask for overlaps. 
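  // The smaller mask must lie entirely below the lowest set bit of the larger
  // one. E.g. dmask 0x1 can be combined with 0xc, but 0x2 cannot be combined
  // with 0x5 even though the two masks are disjoint.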
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) ==
        0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    return true;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
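  // E.g. two ds_read_b32 at byte offsets 0x4000 and 0x4100 have element
  // offsets 0x1000 and 0x1040; both are multiples of 64, and 0x1000/64 = 64
  // and 0x1040/64 = 65 fit in 8 bits, so ds_read2st64_b32 can encode the pair.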
972 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 973 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 974 if (Modify) { 975 CI.Offset = EltOffset0 / 64; 976 Paired.Offset = EltOffset1 / 64; 977 CI.UseST64 = true; 978 } 979 return true; 980 } 981 982 // Check if the new offsets fit in the reduced 8-bit range. 983 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 984 if (Modify) { 985 CI.Offset = EltOffset0; 986 Paired.Offset = EltOffset1; 987 } 988 return true; 989 } 990 991 // Try to shift base address to decrease offsets. 992 uint32_t Min = std::min(EltOffset0, EltOffset1); 993 uint32_t Max = std::max(EltOffset0, EltOffset1); 994 995 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 996 if (((Max - Min) & ~Mask) == 0) { 997 if (Modify) { 998 // From the range of values we could use for BaseOff, choose the one that 999 // is aligned to the highest power of two, to maximise the chance that 1000 // the same offset can be reused for other load/store pairs. 1001 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 1002 // Copy the low bits of the offsets, so that when we adjust them by 1003 // subtracting BaseOff they will be multiples of 64. 1004 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 1005 CI.BaseOff = BaseOff * CI.EltSize; 1006 CI.Offset = (EltOffset0 - BaseOff) / 64; 1007 Paired.Offset = (EltOffset1 - BaseOff) / 64; 1008 CI.UseST64 = true; 1009 } 1010 return true; 1011 } 1012 1013 if (isUInt<8>(Max - Min)) { 1014 if (Modify) { 1015 // From the range of values we could use for BaseOff, choose the one that 1016 // is aligned to the highest power of two, to maximise the chance that 1017 // the same offset can be reused for other load/store pairs. 1018 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 1019 CI.BaseOff = BaseOff * CI.EltSize; 1020 CI.Offset = EltOffset0 - BaseOff; 1021 Paired.Offset = EltOffset1 - BaseOff; 1022 } 1023 return true; 1024 } 1025 1026 return false; 1027 } 1028 1029 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 1030 const CombineInfo &CI, 1031 const CombineInfo &Paired) { 1032 const unsigned Width = (CI.Width + Paired.Width); 1033 switch (CI.InstClass) { 1034 default: 1035 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 1036 case S_BUFFER_LOAD_IMM: 1037 case S_BUFFER_LOAD_SGPR_IMM: 1038 case S_LOAD_IMM: 1039 switch (Width) { 1040 default: 1041 return false; 1042 case 2: 1043 case 4: 1044 case 8: 1045 return true; 1046 } 1047 } 1048 } 1049 1050 const TargetRegisterClass * 1051 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 1052 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 1053 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1054 } 1055 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 1056 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1057 } 1058 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 1059 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1060 } 1061 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 1062 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 1063 } 1064 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 1065 return TRI->getRegClassForReg(*MRI, Src->getReg()); 1066 } 1067 return nullptr; 1068 } 1069 1070 /// This function assumes that CI comes before Paired in a basic block. Return 1071 /// an insertion point for the merged instruction or nullptr on failure. 
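/// The CombineInfo that is returned indicates where the merged instruction
/// should be inserted: loads are hoisted up to CI's position, while stores
/// are sunk down to Paired's position.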
1072 SILoadStoreOptimizer::CombineInfo * 1073 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI, 1074 CombineInfo &Paired) { 1075 // If another instruction has already been merged into CI, it may now be a 1076 // type that we can't do any further merging into. 1077 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN) 1078 return nullptr; 1079 assert(CI.InstClass == Paired.InstClass); 1080 1081 if (getInstSubclass(CI.I->getOpcode(), *TII) != 1082 getInstSubclass(Paired.I->getOpcode(), *TII)) 1083 return nullptr; 1084 1085 // Check both offsets (or masks for MIMG) can be combined and fit in the 1086 // reduced range. 1087 if (CI.InstClass == MIMG) { 1088 if (!dmasksCanBeCombined(CI, *TII, Paired)) 1089 return nullptr; 1090 } else { 1091 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)) 1092 return nullptr; 1093 } 1094 1095 DenseSet<Register> RegDefs; 1096 DenseSet<Register> RegUses; 1097 CombineInfo *Where; 1098 if (CI.I->mayLoad()) { 1099 // Try to hoist Paired up to CI. 1100 addDefsUsesToList(*Paired.I, RegDefs, RegUses); 1101 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) { 1102 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI)) 1103 return nullptr; 1104 } 1105 Where = &CI; 1106 } else { 1107 // Try to sink CI down to Paired. 1108 addDefsUsesToList(*CI.I, RegDefs, RegUses); 1109 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) { 1110 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI)) 1111 return nullptr; 1112 } 1113 Where = &Paired; 1114 } 1115 1116 // Call offsetsCanBeCombined with modify = true so that the offsets are 1117 // correct for the new instruction. This should return true, because 1118 // this function should only be called on CombineInfo objects that 1119 // have already been confirmed to be mergeable. 1120 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE) 1121 offsetsCanBeCombined(CI, *STM, Paired, true); 1122 return Where; 1123 } 1124 1125 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1126 if (STM->ldsRequiresM0Init()) 1127 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1128 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1129 } 1130 1131 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1132 if (STM->ldsRequiresM0Init()) 1133 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1134 1135 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1136 : AMDGPU::DS_READ2ST64_B64_gfx9; 1137 } 1138 1139 MachineBasicBlock::iterator 1140 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1141 MachineBasicBlock::iterator InsertBefore) { 1142 MachineBasicBlock *MBB = CI.I->getParent(); 1143 1144 // Be careful, since the addresses could be subregisters themselves in weird 1145 // cases, like vectors of pointers. 1146 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1147 1148 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1149 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1150 1151 unsigned NewOffset0 = CI.Offset; 1152 unsigned NewOffset1 = Paired.Offset; 1153 unsigned Opc = 1154 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1155 1156 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1157 unsigned SubRegIdx1 = (CI.EltSize == 4) ? 
AMDGPU::sub1 : AMDGPU::sub2_sub3; 1158 1159 if (NewOffset0 > NewOffset1) { 1160 // Canonicalize the merged instruction so the smaller offset comes first. 1161 std::swap(NewOffset0, NewOffset1); 1162 std::swap(SubRegIdx0, SubRegIdx1); 1163 } 1164 1165 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1166 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1167 1168 const MCInstrDesc &Read2Desc = TII->get(Opc); 1169 1170 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1171 Register DestReg = MRI->createVirtualRegister(SuperRC); 1172 1173 DebugLoc DL = CI.I->getDebugLoc(); 1174 1175 Register BaseReg = AddrReg->getReg(); 1176 unsigned BaseSubReg = AddrReg->getSubReg(); 1177 unsigned BaseRegFlags = 0; 1178 if (CI.BaseOff) { 1179 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1180 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1181 .addImm(CI.BaseOff); 1182 1183 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1184 BaseRegFlags = RegState::Kill; 1185 1186 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1187 .addReg(ImmReg) 1188 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1189 .addImm(0); // clamp bit 1190 BaseSubReg = 0; 1191 } 1192 1193 MachineInstrBuilder Read2 = 1194 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg) 1195 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1196 .addImm(NewOffset0) // offset0 1197 .addImm(NewOffset1) // offset1 1198 .addImm(0) // gds 1199 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1200 1201 (void)Read2; 1202 1203 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1204 1205 // Copy to the old destination registers. 1206 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1207 .add(*Dest0) // Copy to same destination including flags and sub reg. 1208 .addReg(DestReg, 0, SubRegIdx0); 1209 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1210 .add(*Dest1) 1211 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1212 1213 CI.I->eraseFromParent(); 1214 Paired.I->eraseFromParent(); 1215 1216 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1217 return Read2; 1218 } 1219 1220 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1221 if (STM->ldsRequiresM0Init()) 1222 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1223 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1224 : AMDGPU::DS_WRITE2_B64_gfx9; 1225 } 1226 1227 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1228 if (STM->ldsRequiresM0Init()) 1229 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1230 : AMDGPU::DS_WRITE2ST64_B64; 1231 1232 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1233 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1234 } 1235 1236 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( 1237 CombineInfo &CI, CombineInfo &Paired, 1238 MachineBasicBlock::iterator InsertBefore) { 1239 MachineBasicBlock *MBB = CI.I->getParent(); 1240 1241 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1242 // sure we preserve the subregister index and any register flags set on them. 1243 const MachineOperand *AddrReg = 1244 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1245 const MachineOperand *Data0 = 1246 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1247 const MachineOperand *Data1 = 1248 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1249 1250 unsigned NewOffset0 = CI.Offset; 1251 unsigned NewOffset1 = Paired.Offset; 1252 unsigned Opc = 1253 CI.UseST64 ? 
write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1254 1255 if (NewOffset0 > NewOffset1) { 1256 // Canonicalize the merged instruction so the smaller offset comes first. 1257 std::swap(NewOffset0, NewOffset1); 1258 std::swap(Data0, Data1); 1259 } 1260 1261 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1262 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1263 1264 const MCInstrDesc &Write2Desc = TII->get(Opc); 1265 DebugLoc DL = CI.I->getDebugLoc(); 1266 1267 Register BaseReg = AddrReg->getReg(); 1268 unsigned BaseSubReg = AddrReg->getSubReg(); 1269 unsigned BaseRegFlags = 0; 1270 if (CI.BaseOff) { 1271 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1272 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1273 .addImm(CI.BaseOff); 1274 1275 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1276 BaseRegFlags = RegState::Kill; 1277 1278 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg) 1279 .addReg(ImmReg) 1280 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1281 .addImm(0); // clamp bit 1282 BaseSubReg = 0; 1283 } 1284 1285 MachineInstrBuilder Write2 = 1286 BuildMI(*MBB, InsertBefore, DL, Write2Desc) 1287 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1288 .add(*Data0) // data0 1289 .add(*Data1) // data1 1290 .addImm(NewOffset0) // offset0 1291 .addImm(NewOffset1) // offset1 1292 .addImm(0) // gds 1293 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1294 1295 CI.I->eraseFromParent(); 1296 Paired.I->eraseFromParent(); 1297 1298 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1299 return Write2; 1300 } 1301 1302 MachineBasicBlock::iterator 1303 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1304 MachineBasicBlock::iterator InsertBefore) { 1305 MachineBasicBlock *MBB = CI.I->getParent(); 1306 DebugLoc DL = CI.I->getDebugLoc(); 1307 const unsigned Opcode = getNewOpcode(CI, Paired); 1308 1309 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1310 1311 Register DestReg = MRI->createVirtualRegister(SuperRC); 1312 unsigned MergedDMask = CI.DMask | Paired.DMask; 1313 unsigned DMaskIdx = 1314 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1315 1316 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1317 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1318 if (I == DMaskIdx) 1319 MIB.addImm(MergedDMask); 1320 else 1321 MIB.add((*CI.I).getOperand(I)); 1322 } 1323 1324 // It shouldn't be possible to get this far if the two instructions 1325 // don't have a single memoperand, because MachineInstr::mayAlias() 1326 // will return true if this is the case. 1327 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1328 1329 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1330 1331 unsigned SubRegIdx0, SubRegIdx1; 1332 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1333 1334 // Copy to the old destination registers. 1335 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1336 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1337 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1338 1339 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1340 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1341 .addReg(DestReg, 0, SubRegIdx0); 1342 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1343 .add(*Dest1) 1344 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1345 1346 CI.I->eraseFromParent(); 1347 Paired.I->eraseFromParent(); 1348 return New; 1349 } 1350 1351 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( 1352 CombineInfo &CI, CombineInfo &Paired, 1353 MachineBasicBlock::iterator InsertBefore) { 1354 MachineBasicBlock *MBB = CI.I->getParent(); 1355 DebugLoc DL = CI.I->getDebugLoc(); 1356 const unsigned Opcode = getNewOpcode(CI, Paired); 1357 1358 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1359 1360 Register DestReg = MRI->createVirtualRegister(SuperRC); 1361 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1362 1363 // It shouldn't be possible to get this far if the two instructions 1364 // don't have a single memoperand, because MachineInstr::mayAlias() 1365 // will return true if this is the case. 1366 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1367 1368 MachineInstrBuilder New = 1369 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg) 1370 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); 1371 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) 1372 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); 1373 New.addImm(MergedOffset); 1374 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1375 1376 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1377 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1378 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1379 1380 // Copy to the old destination registers. 1381 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1382 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1383 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1384 1385 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1386 .add(*Dest0) // Copy to same destination including flags and sub reg. 1387 .addReg(DestReg, 0, SubRegIdx0); 1388 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1389 .add(*Dest1) 1390 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1391 1392 CI.I->eraseFromParent(); 1393 Paired.I->eraseFromParent(); 1394 return New; 1395 } 1396 1397 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1398 CombineInfo &CI, CombineInfo &Paired, 1399 MachineBasicBlock::iterator InsertBefore) { 1400 MachineBasicBlock *MBB = CI.I->getParent(); 1401 DebugLoc DL = CI.I->getDebugLoc(); 1402 1403 const unsigned Opcode = getNewOpcode(CI, Paired); 1404 1405 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1406 1407 // Copy to the new source register. 1408 Register DestReg = MRI->createVirtualRegister(SuperRC); 1409 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1410 1411 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1412 1413 AddressRegs Regs = getRegs(Opcode, *TII); 1414 1415 if (Regs.VAddr) 1416 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1417 1418 // It shouldn't be possible to get this far if the two instructions 1419 // don't have a single memoperand, because MachineInstr::mayAlias() 1420 // will return true if this is the case. 
1421 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1422 1423 MachineInstr *New = 1424 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1425 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1426 .addImm(MergedOffset) // offset 1427 .addImm(CI.CPol) // cpol 1428 .addImm(0) // swz 1429 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1430 1431 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1432 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1433 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1434 1435 // Copy to the old destination registers. 1436 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1437 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1438 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1439 1440 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1441 .add(*Dest0) // Copy to same destination including flags and sub reg. 1442 .addReg(DestReg, 0, SubRegIdx0); 1443 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1444 .add(*Dest1) 1445 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1446 1447 CI.I->eraseFromParent(); 1448 Paired.I->eraseFromParent(); 1449 return New; 1450 } 1451 1452 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1453 CombineInfo &CI, CombineInfo &Paired, 1454 MachineBasicBlock::iterator InsertBefore) { 1455 MachineBasicBlock *MBB = CI.I->getParent(); 1456 DebugLoc DL = CI.I->getDebugLoc(); 1457 1458 const unsigned Opcode = getNewOpcode(CI, Paired); 1459 1460 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1461 1462 // Copy to the new source register. 1463 Register DestReg = MRI->createVirtualRegister(SuperRC); 1464 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1465 1466 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1467 1468 AddressRegs Regs = getRegs(Opcode, *TII); 1469 1470 if (Regs.VAddr) 1471 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1472 1473 unsigned JoinedFormat = 1474 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1475 1476 // It shouldn't be possible to get this far if the two instructions 1477 // don't have a single memoperand, because MachineInstr::mayAlias() 1478 // will return true if this is the case. 1479 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1480 1481 MachineInstr *New = 1482 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1483 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1484 .addImm(MergedOffset) // offset 1485 .addImm(JoinedFormat) // format 1486 .addImm(CI.CPol) // cpol 1487 .addImm(0) // swz 1488 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1489 1490 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1491 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1492 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1493 1494 // Copy to the old destination registers. 1495 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1496 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1497 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1498 1499 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1500 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1501 .addReg(DestReg, 0, SubRegIdx0); 1502 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1503 .add(*Dest1) 1504 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1505 1506 CI.I->eraseFromParent(); 1507 Paired.I->eraseFromParent(); 1508 return New; 1509 } 1510 1511 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1512 CombineInfo &CI, CombineInfo &Paired, 1513 MachineBasicBlock::iterator InsertBefore) { 1514 MachineBasicBlock *MBB = CI.I->getParent(); 1515 DebugLoc DL = CI.I->getDebugLoc(); 1516 1517 const unsigned Opcode = getNewOpcode(CI, Paired); 1518 1519 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1520 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1521 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1522 1523 // Copy to the new source register. 1524 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1525 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1526 1527 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1528 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1529 1530 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1531 .add(*Src0) 1532 .addImm(SubRegIdx0) 1533 .add(*Src1) 1534 .addImm(SubRegIdx1); 1535 1536 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1537 .addReg(SrcReg, RegState::Kill); 1538 1539 AddressRegs Regs = getRegs(Opcode, *TII); 1540 1541 if (Regs.VAddr) 1542 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1543 1544 unsigned JoinedFormat = 1545 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1546 1547 // It shouldn't be possible to get this far if the two instructions 1548 // don't have a single memoperand, because MachineInstr::mayAlias() 1549 // will return true if this is the case. 1550 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1551 1552 MachineInstr *New = 1553 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1554 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1555 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1556 .addImm(JoinedFormat) // format 1557 .addImm(CI.CPol) // cpol 1558 .addImm(0) // swz 1559 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1560 1561 CI.I->eraseFromParent(); 1562 Paired.I->eraseFromParent(); 1563 return New; 1564 } 1565 1566 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair( 1567 CombineInfo &CI, CombineInfo &Paired, 1568 MachineBasicBlock::iterator InsertBefore) { 1569 MachineBasicBlock *MBB = CI.I->getParent(); 1570 DebugLoc DL = CI.I->getDebugLoc(); 1571 1572 const unsigned Opcode = getNewOpcode(CI, Paired); 1573 1574 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1575 Register DestReg = MRI->createVirtualRegister(SuperRC); 1576 1577 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg); 1578 1579 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1580 MIB.add(*SAddr); 1581 1582 MachineInstr *New = 1583 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1584 .addImm(std::min(CI.Offset, Paired.Offset)) 1585 .addImm(CI.CPol) 1586 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1587 1588 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1589 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1590 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1591 1592 // Copy to the old destination registers. 
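  // Users of the original narrow destinations are left untouched; these
  // copies re-define those registers from the corresponding sub-registers of
  // the merged result.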
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::pair(Idx0, Idx1);
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
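  // For the VMEM classes the merged register is simply as wide as the two
  // widths combined; keep the result in AGPRs when the original data
  // registers were AGPRs so no cross register-bank copies are introduced.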
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
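// The new base is materialized immediately before MI as, roughly:
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo
//   %hi:vgpr_32 = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// where OffsetLo/OffsetHi are either inline immediates or S_MOV_B32 results
// produced by createRegOrImm.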
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "  "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "  "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "  "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
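// The vaddr operand is rewritten in place to the re-computed base and the
// offset immediate is replaced; the kill flag is cleared because NewBase may
// still be used by other instructions whose offsets get promoted.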
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
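  // Bail out when the opcode has no global SADDR form (getGlobalSaddrOp
  // returns a negative value); only global accesses are handled here.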
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // legal 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  // addr1 = &a + 4096; load1 = load(addr1, 0)
  // addr2 = &a + 6144; load2 = load(addr2, 0)
  // addr3 = &a + 8192; load3 = load(addr3, 0)
  // addr4 = &a + 10240; load4 = load(addr4, 0)
  // addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because the maximum distance can presumably accommodate
  // more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  // addr = &a + 8192
  // load1 = load(addr, -4096)
  // load2 = load(addr, -2048)
  // load3 = load(addr, 0)
  // load4 = load(addr, 2048)
  // addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
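    // Only consider instructions with the same opcode whose immediate offset
    // has not already been promoted (i.e. is still zero).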
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "  After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
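// Each list in MergeableInsts holds instructions that share a base address.
// optimizeInstsWithSameBaseAddr merges what it can within a list; lists that
// cannot be improved any further are dropped so later iterations only revisit
// lists whose merged, wider accesses might pair up again.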
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

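// Entry point. Each basic block is processed in sections that end at a memory
// barrier; for every section we first collect the mergeable instructions and
// then merge pairs until no further opportunities remain.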
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}