//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields but are close enough together, we can adjust the base
//   pointer and use the new, smaller offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ?
                 DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand
                                      &Base, MemAddress &Addr) const;
  /// Promotes a constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case
      AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:      [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:      [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  // For the purposes of this optimization SGPR variants of buffer loads
  // are considered to be zero-offsetted SGPR_IMM loads.
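  // (When such a pair is merged, getNewOpcode() produces the plain SGPR form
  // again if the combined offset is still zero.)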
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  // For the purposes of this optimization SGPR variants of buffer loads
  // are considered to be zero-offsetted SGPR_IMM loads.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT.
// For other instructions return the original unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ?
               GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
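  // Classify the instruction, then fill in the fields the merging logic
  // compares: element size, offset (or dmask for MIMG), width, format, cache
  // policy, and the address operands.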
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.
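
// Register the pass with the legacy pass manager and declare its dependency
// on alias analysis (AAResultsWrapperPass), which is used when checking
// whether memory instructions can be reordered.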
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
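  // E.g. dmasks 0b0011 and 0b1100 can be combined because the smaller mask
  // lies entirely below the lowest set bit of the larger one, while 0b0101
  // and 0b1010 cannot.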
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
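  // E.g. with EltSize == 4, byte offsets 0 and 256 become element offsets 0
  // and 64; both are multiples of 64, so they encode as offset0:0 offset1:1
  // on the ST64 form of the instruction.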
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
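/// On success the return value designates where the merged access is placed:
/// for loads the pair is hoisted up to CI, for stores CI is sunk down to
/// Paired.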
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ?
                            AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ?
          write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  // For convenience, when SGPR_IMM buffer loads are merged into a
  // zero-offset load, we generate its SGPR variant.
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
    New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
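  // The merged load defines one wide register; recreate the original narrow
  // vdst values as subregister copies so existing uses remain valid.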
1609 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1610 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1611 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1612 1613 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1614 .add(*Dest0) // Copy to same destination including flags and sub reg. 1615 .addReg(DestReg, 0, SubRegIdx0); 1616 BuildMI(*MBB, InsertBefore, DL, CopyDesc) 1617 .add(*Dest1) 1618 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1619 1620 CI.I->eraseFromParent(); 1621 Paired.I->eraseFromParent(); 1622 return New; 1623 } 1624 1625 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( 1626 CombineInfo &CI, CombineInfo &Paired, 1627 MachineBasicBlock::iterator InsertBefore) { 1628 MachineBasicBlock *MBB = CI.I->getParent(); 1629 DebugLoc DL = CI.I->getDebugLoc(); 1630 1631 const unsigned Opcode = getNewOpcode(CI, Paired); 1632 1633 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1634 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1635 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1636 1637 // Copy to the new source register. 1638 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1639 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1640 1641 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1642 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1643 1644 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1645 .add(*Src0) 1646 .addImm(SubRegIdx0) 1647 .add(*Src1) 1648 .addImm(SubRegIdx1); 1649 1650 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1651 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)) 1652 .addReg(SrcReg, RegState::Kill); 1653 1654 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr)) 1655 MIB.add(*SAddr); 1656 1657 MachineInstr *New = 1658 MIB.addImm(std::min(CI.Offset, Paired.Offset)) 1659 .addImm(CI.CPol) 1660 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1661 1662 CI.I->eraseFromParent(); 1663 Paired.I->eraseFromParent(); 1664 return New; 1665 } 1666 1667 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1668 const CombineInfo &Paired) { 1669 const unsigned Width = CI.Width + Paired.Width; 1670 1671 switch (getCommonInstClass(CI, Paired)) { 1672 default: 1673 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1674 // FIXME: Handle d16 correctly 1675 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1676 Width); 1677 case TBUFFER_LOAD: 1678 case TBUFFER_STORE: 1679 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1680 Width); 1681 1682 case UNKNOWN: 1683 llvm_unreachable("Unknown instruction class"); 1684 case S_BUFFER_LOAD_IMM: 1685 switch (Width) { 1686 default: 1687 return 0; 1688 case 2: 1689 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1690 case 4: 1691 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1692 case 8: 1693 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1694 } 1695 case S_BUFFER_LOAD_SGPR_IMM: 1696 switch (Width) { 1697 default: 1698 return 0; 1699 case 2: 1700 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR 1701 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; 1702 case 4: 1703 return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR 1704 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; 1705 case 8: 1706 return CI.Offset == 0 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR 1707 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; 1708 } 1709 case S_LOAD_IMM: 1710 switch (Width) { 1711 default: 1712 return 0; 1713 case 2: 1714 return AMDGPU::S_LOAD_DWORDX2_IMM; 1715 case 4: 1716 return AMDGPU::S_LOAD_DWORDX4_IMM; 1717 case 8: 1718 return AMDGPU::S_LOAD_DWORDX8_IMM; 1719 } 1720 case GLOBAL_LOAD: 1721 switch (Width) { 1722 default: 1723 return 0; 1724 case 2: 1725 return AMDGPU::GLOBAL_LOAD_DWORDX2; 1726 case 3: 1727 return AMDGPU::GLOBAL_LOAD_DWORDX3; 1728 case 4: 1729 return AMDGPU::GLOBAL_LOAD_DWORDX4; 1730 } 1731 case GLOBAL_LOAD_SADDR: 1732 switch (Width) { 1733 default: 1734 return 0; 1735 case 2: 1736 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR; 1737 case 3: 1738 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR; 1739 case 4: 1740 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1741 } 1742 case GLOBAL_STORE: 1743 switch (Width) { 1744 default: 1745 return 0; 1746 case 2: 1747 return AMDGPU::GLOBAL_STORE_DWORDX2; 1748 case 3: 1749 return AMDGPU::GLOBAL_STORE_DWORDX3; 1750 case 4: 1751 return AMDGPU::GLOBAL_STORE_DWORDX4; 1752 } 1753 case GLOBAL_STORE_SADDR: 1754 switch (Width) { 1755 default: 1756 return 0; 1757 case 2: 1758 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1759 case 3: 1760 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1761 case 4: 1762 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1763 } 1764 case FLAT_LOAD: 1765 switch (Width) { 1766 default: 1767 return 0; 1768 case 2: 1769 return AMDGPU::FLAT_LOAD_DWORDX2; 1770 case 3: 1771 return AMDGPU::FLAT_LOAD_DWORDX3; 1772 case 4: 1773 return AMDGPU::FLAT_LOAD_DWORDX4; 1774 } 1775 case FLAT_STORE: 1776 switch (Width) { 1777 default: 1778 return 0; 1779 case 2: 1780 return AMDGPU::FLAT_STORE_DWORDX2; 1781 case 3: 1782 return AMDGPU::FLAT_STORE_DWORDX3; 1783 case 4: 1784 return AMDGPU::FLAT_STORE_DWORDX4; 1785 } 1786 case MIMG: 1787 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) && 1788 "No overlaps"); 1789 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1790 } 1791 } 1792 1793 std::pair<unsigned, unsigned> 1794 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1795 const CombineInfo &Paired) { 1796 assert((CI.InstClass != MIMG || 1797 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == 1798 CI.Width + Paired.Width)) && 1799 "No overlaps"); 1800 1801 unsigned Idx0; 1802 unsigned Idx1; 1803 1804 static const unsigned Idxs[5][4] = { 1805 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1806 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1807 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1808 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1809 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1810 }; 1811 1812 assert(CI.Width >= 1 && CI.Width <= 4); 1813 assert(Paired.Width >= 1 && Paired.Width <= 4); 1814 1815 if (Paired < CI) { 1816 Idx1 = Idxs[0][Paired.Width - 1]; 1817 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1818 } else { 1819 Idx0 = Idxs[0][CI.Width - 1]; 1820 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1821 } 1822 1823 return std::pair(Idx0, Idx1); 1824 } 1825 1826 const TargetRegisterClass * 1827 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1828 const CombineInfo &Paired) { 1829 if (CI.InstClass == S_BUFFER_LOAD_IMM || 1830 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) { 1831 switch (CI.Width + Paired.Width) { 1832 
default: 1833 return nullptr; 1834 case 2: 1835 return &AMDGPU::SReg_64_XEXECRegClass; 1836 case 4: 1837 return &AMDGPU::SGPR_128RegClass; 1838 case 8: 1839 return &AMDGPU::SGPR_256RegClass; 1840 case 16: 1841 return &AMDGPU::SGPR_512RegClass; 1842 } 1843 } 1844 1845 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1846 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1847 ? TRI->getAGPRClassForBitWidth(BitWidth) 1848 : TRI->getVGPRClassForBitWidth(BitWidth); 1849 } 1850 1851 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1852 CombineInfo &CI, CombineInfo &Paired, 1853 MachineBasicBlock::iterator InsertBefore) { 1854 MachineBasicBlock *MBB = CI.I->getParent(); 1855 DebugLoc DL = CI.I->getDebugLoc(); 1856 1857 const unsigned Opcode = getNewOpcode(CI, Paired); 1858 1859 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1860 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1861 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1862 1863 // Copy to the new source register. 1864 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1865 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1866 1867 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1868 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1869 1870 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1871 .add(*Src0) 1872 .addImm(SubRegIdx0) 1873 .add(*Src1) 1874 .addImm(SubRegIdx1); 1875 1876 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1877 .addReg(SrcReg, RegState::Kill); 1878 1879 AddressRegs Regs = getRegs(Opcode, *TII); 1880 1881 if (Regs.VAddr) 1882 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1883 1884 1885 // It shouldn't be possible to get this far if the two instructions 1886 // don't have a single memoperand, because MachineInstr::mayAlias() 1887 // will return true if this is the case. 1888 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1889 1890 MachineInstr *New = 1891 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1892 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1893 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1894 .addImm(CI.CPol) // cpol 1895 .addImm(0) // swz 1896 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1897 1898 CI.I->eraseFromParent(); 1899 Paired.I->eraseFromParent(); 1900 return New; 1901 } 1902 1903 MachineOperand 1904 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1905 APInt V(32, Val, true); 1906 if (TII->isInlineConstant(V)) 1907 return MachineOperand::CreateImm(Val); 1908 1909 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1910 MachineInstr *Mov = 1911 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1912 TII->get(AMDGPU::S_MOV_B32), Reg) 1913 .addImm(Val); 1914 (void)Mov; 1915 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1916 return MachineOperand::CreateReg(Reg, false); 1917 } 1918 1919 // Compute base address using Addr and return the final register. 
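// The new base is materialized immediately before MI as a 64-bit add of the
// original base registers and the constant offset: V_ADD_CO_U32_e64 produces
// the low half and the carry, V_ADDC_U32_e64 consumes the carry for the high
// half, and a REG_SEQUENCE combines both halves into a 64-bit VGPR pair.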
1920 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1921 const MemAddress &Addr) const { 1922 MachineBasicBlock *MBB = MI.getParent(); 1923 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1924 DebugLoc DL = MI.getDebugLoc(); 1925 1926 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1927 Addr.Base.LoSubReg) && 1928 "Expected 32-bit Base-Register-Low!!"); 1929 1930 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1931 Addr.Base.HiSubReg) && 1932 "Expected 32-bit Base-Register-Hi!!"); 1933 1934 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1935 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1936 MachineOperand OffsetHi = 1937 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1938 1939 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1940 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1941 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1942 1943 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1944 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1945 MachineInstr *LoHalf = 1946 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1947 .addReg(CarryReg, RegState::Define) 1948 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1949 .add(OffsetLo) 1950 .addImm(0); // clamp bit 1951 (void)LoHalf; 1952 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1953 1954 MachineInstr *HiHalf = 1955 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1956 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1957 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1958 .add(OffsetHi) 1959 .addReg(CarryReg, RegState::Kill) 1960 .addImm(0); // clamp bit 1961 (void)HiHalf; 1962 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1963 1964 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 1965 MachineInstr *FullBase = 1966 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1967 .addReg(DestSub0) 1968 .addImm(AMDGPU::sub0) 1969 .addReg(DestSub1) 1970 .addImm(AMDGPU::sub1); 1971 (void)FullBase; 1972 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1973 1974 return FullDestReg; 1975 } 1976 1977 // Update base and offset with the NewBase and NewOffset in MI. 
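// Only the vaddr register operand and the immediate offset operand of MI are
// rewritten; the kill flag is cleared because the new base register may be
// shared by several rewritten instructions.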
1978 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, 1979 Register NewBase, 1980 int32_t NewOffset) const { 1981 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); 1982 Base->setReg(NewBase); 1983 Base->setIsKill(false); 1984 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); 1985 } 1986 1987 std::optional<int32_t> 1988 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const { 1989 if (Op.isImm()) 1990 return Op.getImm(); 1991 1992 if (!Op.isReg()) 1993 return std::nullopt; 1994 1995 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 1996 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || 1997 !Def->getOperand(1).isImm()) 1998 return std::nullopt; 1999 2000 return Def->getOperand(1).getImm(); 2001 } 2002 2003 // Analyze Base and extracts: 2004 // - 32bit base registers, subregisters 2005 // - 64bit constant offset 2006 // Expecting base computation as: 2007 // %OFFSET0:sgpr_32 = S_MOV_B32 8000 2008 // %LO:vgpr_32, %c:sreg_64_xexec = 2009 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, 2010 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec 2011 // %Base:vreg_64 = 2012 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 2013 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, 2014 MemAddress &Addr) const { 2015 if (!Base.isReg()) 2016 return; 2017 2018 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); 2019 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE 2020 || Def->getNumOperands() != 5) 2021 return; 2022 2023 MachineOperand BaseLo = Def->getOperand(1); 2024 MachineOperand BaseHi = Def->getOperand(3); 2025 if (!BaseLo.isReg() || !BaseHi.isReg()) 2026 return; 2027 2028 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); 2029 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); 2030 2031 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 || 2032 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) 2033 return; 2034 2035 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); 2036 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); 2037 2038 auto Offset0P = extractConstOffset(*Src0); 2039 if (Offset0P) 2040 BaseLo = *Src1; 2041 else { 2042 if (!(Offset0P = extractConstOffset(*Src1))) 2043 return; 2044 BaseLo = *Src0; 2045 } 2046 2047 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); 2048 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); 2049 2050 if (Src0->isImm()) 2051 std::swap(Src0, Src1); 2052 2053 if (!Src1->isImm()) 2054 return; 2055 2056 uint64_t Offset1 = Src1->getImm(); 2057 BaseHi = *Src0; 2058 2059 Addr.Base.LoReg = BaseLo.getReg(); 2060 Addr.Base.HiReg = BaseHi.getReg(); 2061 Addr.Base.LoSubReg = BaseLo.getSubReg(); 2062 Addr.Base.HiSubReg = BaseHi.getSubReg(); 2063 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); 2064 } 2065 2066 bool SILoadStoreOptimizer::promoteConstantOffsetToImm( 2067 MachineInstr &MI, 2068 MemInfoMap &Visited, 2069 SmallPtrSet<MachineInstr *, 4> &AnchorList) const { 2070 2071 if (!(MI.mayLoad() ^ MI.mayStore())) 2072 return false; 2073 2074 // TODO: Support flat and scratch. 
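  // getGlobalSaddrOp returns -1 for opcodes that have no global SADDR form,
  // so the check below restricts the promotion to global memory instructions.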
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (one that has
  // the same base registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
  // have a 13bit distance from &a + 4096. The heuristic chooses &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate addresses.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
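    // Anchor candidates must use the same opcode as MI and must still have a
    // zero immediate offset; anything else is skipped.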
2151 if (MINext.getOpcode() != MI.getOpcode() || 2152 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2153 continue; 2154 2155 const MachineOperand &BaseNext = 2156 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2157 MemAddress MAddrNext; 2158 if (Visited.find(&MINext) == Visited.end()) { 2159 processBaseWithConstOffset(BaseNext, MAddrNext); 2160 Visited[&MINext] = MAddrNext; 2161 } else 2162 MAddrNext = Visited[&MINext]; 2163 2164 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2165 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2166 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2167 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2168 continue; 2169 2170 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset)); 2171 2172 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2173 TargetLoweringBase::AddrMode AM; 2174 AM.HasBaseReg = true; 2175 AM.BaseOffs = Dist; 2176 if (TLI->isLegalGlobalAddressingMode(AM) && 2177 (uint32_t)std::abs(Dist) > MaxDist) { 2178 MaxDist = std::abs(Dist); 2179 2180 AnchorAddr = MAddrNext; 2181 AnchorInst = &MINext; 2182 } 2183 } 2184 2185 if (AnchorInst) { 2186 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2187 AnchorInst->dump()); 2188 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2189 << AnchorAddr.Offset << "\n\n"); 2190 2191 // Instead of moving up, just re-compute anchor-instruction's base address. 2192 Register Base = computeBase(MI, AnchorAddr); 2193 2194 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2195 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2196 2197 for (auto P : InstsWCommonBase) { 2198 TargetLoweringBase::AddrMode AM; 2199 AM.HasBaseReg = true; 2200 AM.BaseOffs = P.second - AnchorAddr.Offset; 2201 2202 if (TLI->isLegalGlobalAddressingMode(AM)) { 2203 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; 2204 dbgs() << ")"; P.first->dump()); 2205 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); 2206 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); 2207 } 2208 } 2209 AnchorList.insert(AnchorInst); 2210 return true; 2211 } 2212 2213 return false; 2214 } 2215 2216 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2217 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2218 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2219 if (AddrList.front().InstClass == CI.InstClass && 2220 AddrList.front().IsAGPR == CI.IsAGPR && 2221 AddrList.front().hasSameBaseAddress(CI)) { 2222 AddrList.emplace_back(CI); 2223 return; 2224 } 2225 } 2226 2227 // Base address not found, so add a new list. 2228 MergeableInsts.emplace_back(1, CI); 2229 } 2230 2231 std::pair<MachineBasicBlock::iterator, bool> 2232 SILoadStoreOptimizer::collectMergeableInsts( 2233 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2234 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2235 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2236 bool Modified = false; 2237 2238 // Sort potential mergeable instructions into lists. One list per base address. 2239 unsigned Order = 0; 2240 MachineBasicBlock::iterator BlockI = Begin; 2241 for (; BlockI != End; ++BlockI) { 2242 MachineInstr &MI = *BlockI; 2243 2244 // We run this before checking if an address is mergeable, because it can produce 2245 // better code even if the instructions aren't mergeable. 
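    // promoteConstantOffsetToImm caches the analyzed addresses in Visited and
    // records chosen anchors in AnchorList, so repeated queries for the same
    // instruction are cheap and anchors are never promoted a second time.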
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. Merging never crosses such a barrier, but the search can
    // resume after it with a separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However, we report that ds_write2 only takes VGPR
      //        data so that machine copy propagation does not create an
      //        illegal instruction with mixed VGPR and AGPR sources.
      //        Consequently, if we created such an instruction the verifier
      //        would complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset. Lists with fewer than two entries cannot
  // produce a merge and are discarded here; the surviving lists are paired up
  // later, in optimizeInstsWithSameBaseAddr.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the list by offset so that mergeable instructions end up adjacent
    // to each other, which makes it easier to find matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}

// Scan through looking for adjacent memory operations with constant offsets
// from the same base register. We rely on the scheduler to do the hard work
// of clustering nearby loads, and assume these are all adjacent.
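// Returns true if any pair was merged. Lists that produce no merge, or that
// cannot be improved any further, are erased from MergeableInsts so they are
// not revisited; OptimizeAgain is set when a list may still yield more merges.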
2326 bool SILoadStoreOptimizer::optimizeBlock( 2327 std::list<std::list<CombineInfo> > &MergeableInsts) { 2328 bool Modified = false; 2329 2330 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2331 E = MergeableInsts.end(); I != E;) { 2332 std::list<CombineInfo> &MergeList = *I; 2333 2334 bool OptimizeListAgain = false; 2335 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2336 // We weren't able to make any changes, so delete the list so we don't 2337 // process the same instructions the next time we try to optimize this 2338 // block. 2339 I = MergeableInsts.erase(I); 2340 continue; 2341 } 2342 2343 Modified = true; 2344 2345 // We made changes, but also determined that there were no more optimization 2346 // opportunities, so we don't need to reprocess the list 2347 if (!OptimizeListAgain) { 2348 I = MergeableInsts.erase(I); 2349 continue; 2350 } 2351 OptimizeAgain = true; 2352 } 2353 return Modified; 2354 } 2355 2356 bool 2357 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2358 std::list<CombineInfo> &MergeList, 2359 bool &OptimizeListAgain) { 2360 if (MergeList.empty()) 2361 return false; 2362 2363 bool Modified = false; 2364 2365 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2366 Next = std::next(I)) { 2367 2368 auto First = I; 2369 auto Second = Next; 2370 2371 if ((*First).Order > (*Second).Order) 2372 std::swap(First, Second); 2373 CombineInfo &CI = *First; 2374 CombineInfo &Paired = *Second; 2375 2376 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2377 if (!Where) { 2378 ++I; 2379 continue; 2380 } 2381 2382 Modified = true; 2383 2384 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2385 2386 MachineBasicBlock::iterator NewMI; 2387 switch (CI.InstClass) { 2388 default: 2389 llvm_unreachable("unknown InstClass"); 2390 break; 2391 case DS_READ: 2392 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2393 break; 2394 case DS_WRITE: 2395 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2396 break; 2397 case S_BUFFER_LOAD_IMM: 2398 case S_BUFFER_LOAD_SGPR_IMM: 2399 case S_LOAD_IMM: 2400 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I); 2401 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2402 break; 2403 case BUFFER_LOAD: 2404 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2405 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2406 break; 2407 case BUFFER_STORE: 2408 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2409 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2410 break; 2411 case MIMG: 2412 NewMI = mergeImagePair(CI, Paired, Where->I); 2413 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2414 break; 2415 case TBUFFER_LOAD: 2416 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2417 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2418 break; 2419 case TBUFFER_STORE: 2420 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2421 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2422 break; 2423 case FLAT_LOAD: 2424 case GLOBAL_LOAD: 2425 case GLOBAL_LOAD_SADDR: 2426 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2427 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2428 break; 2429 case FLAT_STORE: 2430 case GLOBAL_STORE: 2431 case GLOBAL_STORE_SADDR: 2432 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2433 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2434 break; 2435 } 2436 CI.setMI(NewMI, *this); 2437 CI.Order = Where->Order; 2438 if (I == Second) 2439 I = Next; 2440 2441 MergeList.erase(Second); 2442 } 2443 2444 return Modified; 2445 } 2446 
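// Entry point. Each basic block is processed in sections delimited by
// instructions with ordered memory references or unmodeled side effects;
// mergeable instructions are collected per section and then merged repeatedly
// until no list can be optimized any further.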
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect a list of all instructions we know how to merge
      // within a subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}