//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset, and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
57 // 58 //===----------------------------------------------------------------------===// 59 60 #include "AMDGPU.h" 61 #include "AMDGPUSubtarget.h" 62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 63 #include "SIInstrInfo.h" 64 #include "SIRegisterInfo.h" 65 #include "Utils/AMDGPUBaseInfo.h" 66 #include "llvm/ADT/ArrayRef.h" 67 #include "llvm/ADT/SmallVector.h" 68 #include "llvm/ADT/StringRef.h" 69 #include "llvm/Analysis/AliasAnalysis.h" 70 #include "llvm/CodeGen/MachineBasicBlock.h" 71 #include "llvm/CodeGen/MachineFunction.h" 72 #include "llvm/CodeGen/MachineFunctionPass.h" 73 #include "llvm/CodeGen/MachineInstr.h" 74 #include "llvm/CodeGen/MachineInstrBuilder.h" 75 #include "llvm/CodeGen/MachineOperand.h" 76 #include "llvm/CodeGen/MachineRegisterInfo.h" 77 #include "llvm/IR/DebugLoc.h" 78 #include "llvm/InitializePasses.h" 79 #include "llvm/Pass.h" 80 #include "llvm/Support/Debug.h" 81 #include "llvm/Support/MathExtras.h" 82 #include "llvm/Support/raw_ostream.h" 83 #include <algorithm> 84 #include <cassert> 85 #include <cstdlib> 86 #include <iterator> 87 #include <utility> 88 89 using namespace llvm; 90 91 #define DEBUG_TYPE "si-load-store-opt" 92 93 namespace { 94 enum InstClassEnum { 95 UNKNOWN, 96 DS_READ, 97 DS_WRITE, 98 S_BUFFER_LOAD_IMM, 99 BUFFER_LOAD, 100 BUFFER_STORE, 101 MIMG, 102 TBUFFER_LOAD, 103 TBUFFER_STORE, 104 }; 105 106 struct AddressRegs { 107 unsigned char NumVAddrs = 0; 108 bool SBase = false; 109 bool SRsrc = false; 110 bool SOffset = false; 111 bool VAddr = false; 112 bool Addr = false; 113 bool SSamp = false; 114 }; 115 116 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 117 const unsigned MaxAddressRegs = 12 + 1 + 1; 118 119 class SILoadStoreOptimizer : public MachineFunctionPass { 120 struct CombineInfo { 121 MachineBasicBlock::iterator I; 122 unsigned EltSize; 123 unsigned Offset; 124 unsigned Width; 125 unsigned Format; 126 unsigned BaseOff; 127 unsigned DMask; 128 InstClassEnum InstClass; 129 bool GLC; 130 bool SLC; 131 bool DLC; 132 bool UseST64; 133 int AddrIdx[MaxAddressRegs]; 134 const MachineOperand *AddrReg[MaxAddressRegs]; 135 unsigned NumAddresses; 136 unsigned Order; 137 138 bool hasSameBaseAddress(const MachineInstr &MI) { 139 for (unsigned i = 0; i < NumAddresses; i++) { 140 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 141 142 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 143 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 144 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 145 return false; 146 } 147 continue; 148 } 149 150 // Check same base pointer. Be careful of subregisters, which can occur 151 // with vectors of pointers. 152 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 153 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 154 return false; 155 } 156 } 157 return true; 158 } 159 160 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 161 for (unsigned i = 0; i < NumAddresses; ++i) { 162 const MachineOperand *AddrOp = AddrReg[i]; 163 // Immediates are always OK. 164 if (AddrOp->isImm()) 165 continue; 166 167 // Don't try to merge addresses that aren't either immediates or registers. 168 // TODO: Should be possible to merge FrameIndexes and maybe some other 169 // non-register 170 if (!AddrOp->isReg()) 171 return false; 172 173 // TODO: We should be able to merge physical reg addreses. 
174 if (Register::isPhysicalRegister(AddrOp->getReg())) 175 return false; 176 177 // If an address has only one use then there will be on other 178 // instructions with the same address, so we can't merge this one. 179 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 180 return false; 181 } 182 return true; 183 } 184 185 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 186 const GCNSubtarget &STM); 187 }; 188 189 struct BaseRegisters { 190 Register LoReg; 191 Register HiReg; 192 193 unsigned LoSubReg = 0; 194 unsigned HiSubReg = 0; 195 }; 196 197 struct MemAddress { 198 BaseRegisters Base; 199 int64_t Offset = 0; 200 }; 201 202 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 203 204 private: 205 const GCNSubtarget *STM = nullptr; 206 const SIInstrInfo *TII = nullptr; 207 const SIRegisterInfo *TRI = nullptr; 208 MachineRegisterInfo *MRI = nullptr; 209 AliasAnalysis *AA = nullptr; 210 bool OptimizeAgain; 211 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 223 const CombineInfo &Paired); 224 225 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 226 SmallVectorImpl<MachineInstr *> &InstsToMove); 227 228 unsigned read2Opcode(unsigned EltSize) const; 229 unsigned read2ST64Opcode(unsigned EltSize) const; 230 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 231 CombineInfo &Paired, 232 const SmallVectorImpl<MachineInstr *> &InstsToMove); 233 234 unsigned write2Opcode(unsigned EltSize) const; 235 unsigned write2ST64Opcode(unsigned EltSize) const; 236 MachineBasicBlock::iterator 237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 238 const SmallVectorImpl<MachineInstr *> &InstsToMove); 239 MachineBasicBlock::iterator 240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 241 const SmallVectorImpl<MachineInstr *> &InstsToMove); 242 MachineBasicBlock::iterator 243 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 244 const SmallVectorImpl<MachineInstr *> &InstsToMove); 245 MachineBasicBlock::iterator 246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 247 const SmallVectorImpl<MachineInstr *> &InstsToMove); 248 MachineBasicBlock::iterator 249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 250 const SmallVectorImpl<MachineInstr *> &InstsToMove); 251 MachineBasicBlock::iterator 252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 253 const SmallVectorImpl<MachineInstr *> &InstsToMove); 254 MachineBasicBlock::iterator 255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 256 const SmallVectorImpl<MachineInstr *> &InstsToMove); 257 258 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 259 int32_t NewOffset) const; 260 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 261 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 262 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 263 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 264 /// 
Promotes constant offset to the immediate by adjusting the base. It 265 /// tries to use a base from the nearby instructions that allows it to have 266 /// a 13bit constant offset which gets promoted to the immediate. 267 bool promoteConstantOffsetToImm(MachineInstr &CI, 268 MemInfoMap &Visited, 269 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 270 void addInstToMergeableList(const CombineInfo &CI, 271 std::list<std::list<CombineInfo> > &MergeableInsts) const; 272 273 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 274 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 275 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 276 std::list<std::list<CombineInfo>> &MergeableInsts) const; 277 278 public: 279 static char ID; 280 281 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 282 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 283 } 284 285 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 286 bool &OptimizeListAgain); 287 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 288 289 bool runOnMachineFunction(MachineFunction &MF) override; 290 291 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 292 293 void getAnalysisUsage(AnalysisUsage &AU) const override { 294 AU.setPreservesCFG(); 295 AU.addRequired<AAResultsWrapperPass>(); 296 297 MachineFunctionPass::getAnalysisUsage(AU); 298 } 299 300 MachineFunctionProperties getRequiredProperties() const override { 301 return MachineFunctionProperties() 302 .set(MachineFunctionProperties::Property::IsSSA); 303 } 304 }; 305 306 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 307 const unsigned Opc = MI.getOpcode(); 308 309 if (TII.isMUBUF(Opc)) { 310 // FIXME: Handle d16 correctly 311 return AMDGPU::getMUBUFElements(Opc); 312 } 313 if (TII.isMIMG(MI)) { 314 uint64_t DMaskImm = 315 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 316 return countPopulation(DMaskImm); 317 } 318 if (TII.isMTBUF(Opc)) { 319 return AMDGPU::getMTBUFElements(Opc); 320 } 321 322 switch (Opc) { 323 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 324 return 1; 325 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 326 return 2; 327 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 328 return 4; 329 default: 330 return 0; 331 } 332 } 333 334 /// Maps instruction opcode to enum InstClassEnum. 335 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 336 switch (Opc) { 337 default: 338 if (TII.isMUBUF(Opc)) { 339 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 340 default: 341 return UNKNOWN; 342 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 343 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 344 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 345 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 346 return BUFFER_LOAD; 347 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 348 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 349 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 350 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 351 return BUFFER_STORE; 352 } 353 } 354 if (TII.isMIMG(Opc)) { 355 // Ignore instructions encoded without vaddr. 356 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 357 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 358 return UNKNOWN; 359 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
360 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 361 TII.isGather4(Opc)) 362 return UNKNOWN; 363 return MIMG; 364 } 365 if (TII.isMTBUF(Opc)) { 366 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 367 default: 368 return UNKNOWN; 369 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 370 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 371 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 372 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 373 return TBUFFER_LOAD; 374 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 375 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 376 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 377 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 378 return TBUFFER_STORE; 379 } 380 } 381 return UNKNOWN; 382 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 383 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 384 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 385 return S_BUFFER_LOAD_IMM; 386 case AMDGPU::DS_READ_B32: 387 case AMDGPU::DS_READ_B32_gfx9: 388 case AMDGPU::DS_READ_B64: 389 case AMDGPU::DS_READ_B64_gfx9: 390 return DS_READ; 391 case AMDGPU::DS_WRITE_B32: 392 case AMDGPU::DS_WRITE_B32_gfx9: 393 case AMDGPU::DS_WRITE_B64: 394 case AMDGPU::DS_WRITE_B64_gfx9: 395 return DS_WRITE; 396 } 397 } 398 399 /// Determines instruction subclass from opcode. Only instructions 400 /// of the same subclass can be merged together. 401 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 402 switch (Opc) { 403 default: 404 if (TII.isMUBUF(Opc)) 405 return AMDGPU::getMUBUFBaseOpcode(Opc); 406 if (TII.isMIMG(Opc)) { 407 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 408 assert(Info); 409 return Info->BaseOpcode; 410 } 411 if (TII.isMTBUF(Opc)) 412 return AMDGPU::getMTBUFBaseOpcode(Opc); 413 return -1; 414 case AMDGPU::DS_READ_B32: 415 case AMDGPU::DS_READ_B32_gfx9: 416 case AMDGPU::DS_READ_B64: 417 case AMDGPU::DS_READ_B64_gfx9: 418 case AMDGPU::DS_WRITE_B32: 419 case AMDGPU::DS_WRITE_B32_gfx9: 420 case AMDGPU::DS_WRITE_B64: 421 case AMDGPU::DS_WRITE_B64_gfx9: 422 return Opc; 423 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 424 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 425 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 426 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 427 } 428 } 429 430 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 431 AddressRegs Result; 432 433 if (TII.isMUBUF(Opc)) { 434 if (AMDGPU::getMUBUFHasVAddr(Opc)) 435 Result.VAddr = true; 436 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 437 Result.SRsrc = true; 438 if (AMDGPU::getMUBUFHasSoffset(Opc)) 439 Result.SOffset = true; 440 441 return Result; 442 } 443 444 if (TII.isMIMG(Opc)) { 445 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 446 if (VAddr0Idx >= 0) { 447 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 448 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 449 } else { 450 Result.VAddr = true; 451 } 452 Result.SRsrc = true; 453 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 454 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 455 Result.SSamp = true; 456 457 return Result; 458 } 459 if (TII.isMTBUF(Opc)) { 460 if (AMDGPU::getMTBUFHasVAddr(Opc)) 461 Result.VAddr = true; 462 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 463 Result.SRsrc = true; 464 if (AMDGPU::getMTBUFHasSoffset(Opc)) 465 Result.SOffset = true; 466 467 return Result; 468 } 469 470 switch (Opc) { 471 default: 472 return Result; 473 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 474 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 475 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 476 Result.SBase = true; 477 return Result; 478 case 
AMDGPU::DS_READ_B32: 479 case AMDGPU::DS_READ_B64: 480 case AMDGPU::DS_READ_B32_gfx9: 481 case AMDGPU::DS_READ_B64_gfx9: 482 case AMDGPU::DS_WRITE_B32: 483 case AMDGPU::DS_WRITE_B64: 484 case AMDGPU::DS_WRITE_B32_gfx9: 485 case AMDGPU::DS_WRITE_B64_gfx9: 486 Result.Addr = true; 487 return Result; 488 } 489 } 490 491 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 492 const SIInstrInfo &TII, 493 const GCNSubtarget &STM) { 494 I = MI; 495 unsigned Opc = MI->getOpcode(); 496 InstClass = getInstClass(Opc, TII); 497 498 if (InstClass == UNKNOWN) 499 return; 500 501 switch (InstClass) { 502 case DS_READ: 503 EltSize = 504 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 505 : 4; 506 break; 507 case DS_WRITE: 508 EltSize = 509 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 510 : 4; 511 break; 512 case S_BUFFER_LOAD_IMM: 513 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 514 break; 515 default: 516 EltSize = 4; 517 break; 518 } 519 520 if (InstClass == MIMG) { 521 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 522 // Offset is not considered for MIMG instructions. 523 Offset = 0; 524 } else { 525 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 526 Offset = I->getOperand(OffsetIdx).getImm(); 527 } 528 529 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 530 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 531 532 Width = getOpcodeWidth(*I, TII); 533 534 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 535 Offset &= 0xffff; 536 } else if (InstClass != MIMG) { 537 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 538 if (InstClass != S_BUFFER_LOAD_IMM) { 539 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 540 } 541 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 542 } 543 544 AddressRegs Regs = getRegs(Opc, TII); 545 546 NumAddresses = 0; 547 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 548 AddrIdx[NumAddresses++] = 549 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 550 if (Regs.Addr) 551 AddrIdx[NumAddresses++] = 552 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 553 if (Regs.SBase) 554 AddrIdx[NumAddresses++] = 555 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 556 if (Regs.SRsrc) 557 AddrIdx[NumAddresses++] = 558 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 559 if (Regs.SOffset) 560 AddrIdx[NumAddresses++] = 561 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 562 if (Regs.VAddr) 563 AddrIdx[NumAddresses++] = 564 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 565 if (Regs.SSamp) 566 AddrIdx[NumAddresses++] = 567 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 568 assert(NumAddresses <= MaxAddressRegs); 569 570 for (unsigned J = 0; J < NumAddresses; J++) 571 AddrReg[J] = &I->getOperand(AddrIdx[J]); 572 } 573 574 } // end anonymous namespace. 

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and that they reference adjacent memory.
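// For example (values purely illustrative): if \p A covers 4 bytes at offset
// 16 and \p B covers 4 bytes at offset 20, the combined memory operand covers
// 8 bytes starting at offset 16, i.e. size = 4 + 4 and offset = min(16, 20).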
666 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 667 const MachineMemOperand *A, 668 const MachineMemOperand *B) { 669 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 670 unsigned Size = A->getSize() + B->getSize(); 671 // This function adds the offset parameter to the existing offset for A, 672 // so we pass 0 here as the offset and then manually set it to the correct 673 // value after the call. 674 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 675 MMO->setOffset(MinOffset); 676 return MMO; 677 } 678 679 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 680 const SIInstrInfo &TII, 681 const CombineInfo &Paired) { 682 assert(CI.InstClass == MIMG); 683 684 // Ignore instructions with tfe/lwe set. 685 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 686 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 687 688 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 689 return false; 690 691 // Check other optional immediate operands for equality. 692 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, 693 AMDGPU::OpName::d16, AMDGPU::OpName::unorm, 694 AMDGPU::OpName::da, AMDGPU::OpName::r128, 695 AMDGPU::OpName::a16, AMDGPU::OpName::dlc}; 696 697 for (auto op : OperandsToMatch) { 698 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 699 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 700 return false; 701 if (Idx != -1 && 702 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 703 return false; 704 } 705 706 // Check DMask for overlaps. 707 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 708 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 709 710 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 711 if ((1u << AllowedBitsForMin) <= MinMask) 712 return false; 713 714 return true; 715 } 716 717 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 718 unsigned ComponentCount, 719 const GCNSubtarget &STI) { 720 if (ComponentCount > 4) 721 return 0; 722 723 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 724 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 725 if (!OldFormatInfo) 726 return 0; 727 728 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 729 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 730 ComponentCount, 731 OldFormatInfo->NumFormat, STI); 732 733 if (!NewFormatInfo) 734 return 0; 735 736 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 737 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 738 739 return NewFormatInfo->Format; 740 } 741 742 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 743 const GCNSubtarget &STI, 744 CombineInfo &Paired, 745 bool Modify) { 746 assert(CI.InstClass != MIMG); 747 748 // XXX - Would the same offset be OK? Is there any reason this would happen or 749 // be useful? 750 if (CI.Offset == Paired.Offset) 751 return false; 752 753 // This won't be valid if the offset isn't aligned. 
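  // (Below, offsets are converted to element units: EltOffset = Offset /
  // EltSize. Illustrative DS example: with DS_READ_B32, EltSize = 4, so byte
  // offsets 16 and 32 become element offsets 4 and 8 and fit the 8-bit
  // offset0/offset1 fields directly, while byte offsets 0x1000 and 0x2000
  // (elements 1024 and 2048) only fit via the stride-64 ST64 forms, as
  // 1024/64 = 16 and 2048/64 = 32.)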
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  unsigned EltOffset0 = CI.Offset / CI.EltSize;
  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
  }

  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset, Paired.Offset);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    if (Modify) {
      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    if (Modify) {
      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {

  // Check that both offsets (or masks for MIMG) can be combined and fit in
  // the reduced range.
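  // For instance (illustrative dmask values): dmask 0x3 (components 0-1) and
  // dmask 0xc (components 2-3) do not overlap and can be merged into dmask
  // 0xf, whereas dmask 0x3 and dmask 0x6 share component 1 and are rejected
  // by dmasksCanBeCombined().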
865 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) 866 return false; 867 868 if (CI.InstClass != MIMG && 869 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) 870 return false; 871 872 const unsigned Opc = CI.I->getOpcode(); 873 const InstClassEnum InstClass = getInstClass(Opc, *TII); 874 875 if (InstClass == UNKNOWN) { 876 return false; 877 } 878 const unsigned InstSubclass = getInstSubclass(Opc, *TII); 879 880 // Do not merge VMEM buffer instructions with "swizzled" bit set. 881 int Swizzled = 882 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); 883 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) 884 return false; 885 886 DenseSet<Register> RegDefsToMove; 887 DenseSet<Register> PhysRegUsesToMove; 888 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); 889 890 MachineBasicBlock::iterator E = std::next(Paired.I); 891 MachineBasicBlock::iterator MBBI = std::next(CI.I); 892 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); 893 for (; MBBI != E; ++MBBI) { 894 895 if (MBBI == MBBE) { 896 // CombineInfo::Order is a hint on the instruction ordering within the 897 // basic block. This hint suggests that CI precedes Paired, which is 898 // true most of the time. However, moveInstsAfter() processing a 899 // previous list may have changed this order in a situation when it 900 // moves an instruction which exists in some other merge list. 901 // In this case it must be dependent. 902 return false; 903 } 904 905 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || 906 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { 907 // This is not a matching instruction, but we can keep looking as 908 // long as one of these conditions are met: 909 // 1. It is safe to move I down past MBBI. 910 // 2. It is safe to move MBBI down past the instruction that I will 911 // be merged into. 912 913 if (MBBI->hasUnmodeledSideEffects()) { 914 // We can't re-order this instruction with respect to other memory 915 // operations, so we fail both conditions mentioned above. 916 return false; 917 } 918 919 if (MBBI->mayLoadOrStore() && 920 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 921 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { 922 // We fail condition #1, but we may still be able to satisfy condition 923 // #2. Add this instruction to the move list and then we will check 924 // if condition #2 holds once we have selected the matching instruction. 925 InstsToMove.push_back(&*MBBI); 926 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); 927 continue; 928 } 929 930 // When we match I with another DS instruction we will be moving I down 931 // to the location of the matched instruction any uses of I will need to 932 // be moved down as well. 933 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 934 InstsToMove); 935 continue; 936 } 937 938 // Don't merge volatiles. 939 if (MBBI->hasOrderedMemoryRef()) 940 return false; 941 942 int Swizzled = 943 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz); 944 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm()) 945 return false; 946 947 // Handle a case like 948 // DS_WRITE_B32 addr, v, idx0 949 // w = DS_READ_B32 addr, idx0 950 // DS_WRITE_B32 addr, f(w), idx1 951 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents 952 // merging of the two writes. 
953 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 954 InstsToMove)) 955 continue; 956 957 if (&*MBBI == &*Paired.I) { 958 // We need to go through the list of instructions that we plan to 959 // move and make sure they are all safe to move down past the merged 960 // instruction. 961 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 962 963 // Call offsetsCanBeCombined with modify = true so that the offsets are 964 // correct for the new instruction. This should return true, because 965 // this function should only be called on CombineInfo objects that 966 // have already been confirmed to be mergeable. 967 if (CI.InstClass != MIMG) 968 offsetsCanBeCombined(CI, *STM, Paired, true); 969 return true; 970 } 971 return false; 972 } 973 974 // We've found a load/store that we couldn't merge for some reason. 975 // We could potentially keep looking, but we'd need to make sure that 976 // it was safe to move I and also all the instruction in InstsToMove 977 // down past this instruction. 978 // check if we can move I across MBBI and if we can move all I's users 979 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 980 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 981 break; 982 } 983 return false; 984 } 985 986 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 987 if (STM->ldsRequiresM0Init()) 988 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 989 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 990 } 991 992 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 993 if (STM->ldsRequiresM0Init()) 994 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 995 996 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 997 : AMDGPU::DS_READ2ST64_B64_gfx9; 998 } 999 1000 MachineBasicBlock::iterator 1001 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1002 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1003 MachineBasicBlock *MBB = CI.I->getParent(); 1004 1005 // Be careful, since the addresses could be subregisters themselves in weird 1006 // cases, like vectors of pointers. 1007 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1008 1009 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1010 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1011 1012 unsigned NewOffset0 = CI.Offset; 1013 unsigned NewOffset1 = Paired.Offset; 1014 unsigned Opc = 1015 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1016 1017 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1018 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1019 1020 if (NewOffset0 > NewOffset1) { 1021 // Canonicalize the merged instruction so the smaller offset comes first. 1022 std::swap(NewOffset0, NewOffset1); 1023 std::swap(SubRegIdx0, SubRegIdx1); 1024 } 1025 1026 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1027 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1028 1029 const MCInstrDesc &Read2Desc = TII->get(Opc); 1030 1031 const TargetRegisterClass *SuperRC = 1032 (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; 1033 Register DestReg = MRI->createVirtualRegister(SuperRC); 1034 1035 DebugLoc DL = CI.I->getDebugLoc(); 1036 1037 Register BaseReg = AddrReg->getReg(); 1038 unsigned BaseSubReg = AddrReg->getSubReg(); 1039 unsigned BaseRegFlags = 0; 1040 if (CI.BaseOff) { 1041 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1042 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1043 .addImm(CI.BaseOff); 1044 1045 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1046 BaseRegFlags = RegState::Kill; 1047 1048 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1049 .addReg(ImmReg) 1050 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1051 .addImm(0); // clamp bit 1052 BaseSubReg = 0; 1053 } 1054 1055 MachineInstrBuilder Read2 = 1056 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1057 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1058 .addImm(NewOffset0) // offset0 1059 .addImm(NewOffset1) // offset1 1060 .addImm(0) // gds 1061 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1062 1063 (void)Read2; 1064 1065 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1066 1067 // Copy to the old destination registers. 1068 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1069 .add(*Dest0) // Copy to same destination including flags and sub reg. 1070 .addReg(DestReg, 0, SubRegIdx0); 1071 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1072 .add(*Dest1) 1073 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1074 1075 moveInstsAfter(Copy1, InstsToMove); 1076 1077 CI.I->eraseFromParent(); 1078 Paired.I->eraseFromParent(); 1079 1080 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1081 return Read2; 1082 } 1083 1084 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1085 if (STM->ldsRequiresM0Init()) 1086 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1087 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1088 : AMDGPU::DS_WRITE2_B64_gfx9; 1089 } 1090 1091 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1092 if (STM->ldsRequiresM0Init()) 1093 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1094 : AMDGPU::DS_WRITE2ST64_B64; 1095 1096 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1097 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1098 } 1099 1100 MachineBasicBlock::iterator 1101 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1102 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1103 MachineBasicBlock *MBB = CI.I->getParent(); 1104 1105 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1106 // sure we preserve the subregister index and any register flags set on them. 1107 const MachineOperand *AddrReg = 1108 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1109 const MachineOperand *Data0 = 1110 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1111 const MachineOperand *Data1 = 1112 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1113 1114 unsigned NewOffset0 = CI.Offset; 1115 unsigned NewOffset1 = Paired.Offset; 1116 unsigned Opc = 1117 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1118 1119 if (NewOffset0 > NewOffset1) { 1120 // Canonicalize the merged instruction so the smaller offset comes first. 
1121 std::swap(NewOffset0, NewOffset1); 1122 std::swap(Data0, Data1); 1123 } 1124 1125 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1126 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1127 1128 const MCInstrDesc &Write2Desc = TII->get(Opc); 1129 DebugLoc DL = CI.I->getDebugLoc(); 1130 1131 Register BaseReg = AddrReg->getReg(); 1132 unsigned BaseSubReg = AddrReg->getSubReg(); 1133 unsigned BaseRegFlags = 0; 1134 if (CI.BaseOff) { 1135 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1136 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1137 .addImm(CI.BaseOff); 1138 1139 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1140 BaseRegFlags = RegState::Kill; 1141 1142 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1143 .addReg(ImmReg) 1144 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1145 .addImm(0); // clamp bit 1146 BaseSubReg = 0; 1147 } 1148 1149 MachineInstrBuilder Write2 = 1150 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1151 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1152 .add(*Data0) // data0 1153 .add(*Data1) // data1 1154 .addImm(NewOffset0) // offset0 1155 .addImm(NewOffset1) // offset1 1156 .addImm(0) // gds 1157 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1158 1159 moveInstsAfter(Write2, InstsToMove); 1160 1161 CI.I->eraseFromParent(); 1162 Paired.I->eraseFromParent(); 1163 1164 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1165 return Write2; 1166 } 1167 1168 MachineBasicBlock::iterator 1169 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1170 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1171 MachineBasicBlock *MBB = CI.I->getParent(); 1172 DebugLoc DL = CI.I->getDebugLoc(); 1173 const unsigned Opcode = getNewOpcode(CI, Paired); 1174 1175 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1176 1177 Register DestReg = MRI->createVirtualRegister(SuperRC); 1178 unsigned MergedDMask = CI.DMask | Paired.DMask; 1179 unsigned DMaskIdx = 1180 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1181 1182 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1183 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1184 if (I == DMaskIdx) 1185 MIB.addImm(MergedDMask); 1186 else 1187 MIB.add((*CI.I).getOperand(I)); 1188 } 1189 1190 // It shouldn't be possible to get this far if the two instructions 1191 // don't have a single memoperand, because MachineInstr::mayAlias() 1192 // will return true if this is the case. 1193 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1194 1195 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1196 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1197 1198 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1199 1200 unsigned SubRegIdx0, SubRegIdx1; 1201 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1202 1203 // Copy to the old destination registers. 1204 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1205 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1206 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1207 1208 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1209 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1210 .addReg(DestReg, 0, SubRegIdx0); 1211 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1212 .add(*Dest1) 1213 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1214 1215 moveInstsAfter(Copy1, InstsToMove); 1216 1217 CI.I->eraseFromParent(); 1218 Paired.I->eraseFromParent(); 1219 return New; 1220 } 1221 1222 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1223 CombineInfo &CI, CombineInfo &Paired, 1224 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1225 MachineBasicBlock *MBB = CI.I->getParent(); 1226 DebugLoc DL = CI.I->getDebugLoc(); 1227 const unsigned Opcode = getNewOpcode(CI, Paired); 1228 1229 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1230 1231 Register DestReg = MRI->createVirtualRegister(SuperRC); 1232 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1233 1234 // It shouldn't be possible to get this far if the two instructions 1235 // don't have a single memoperand, because MachineInstr::mayAlias() 1236 // will return true if this is the case. 1237 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1238 1239 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1240 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1241 1242 MachineInstr *New = 1243 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1244 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1245 .addImm(MergedOffset) // offset 1246 .addImm(CI.GLC) // glc 1247 .addImm(CI.DLC) // dlc 1248 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1249 1250 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1251 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1252 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1253 1254 // Copy to the old destination registers. 1255 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1256 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1257 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1258 1259 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1260 .add(*Dest0) // Copy to same destination including flags and sub reg. 1261 .addReg(DestReg, 0, SubRegIdx0); 1262 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1263 .add(*Dest1) 1264 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1265 1266 moveInstsAfter(Copy1, InstsToMove); 1267 1268 CI.I->eraseFromParent(); 1269 Paired.I->eraseFromParent(); 1270 return New; 1271 } 1272 1273 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1274 CombineInfo &CI, CombineInfo &Paired, 1275 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1276 MachineBasicBlock *MBB = CI.I->getParent(); 1277 DebugLoc DL = CI.I->getDebugLoc(); 1278 1279 const unsigned Opcode = getNewOpcode(CI, Paired); 1280 1281 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1282 1283 // Copy to the new source register. 1284 Register DestReg = MRI->createVirtualRegister(SuperRC); 1285 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1286 1287 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1288 1289 AddressRegs Regs = getRegs(Opcode, *TII); 1290 1291 if (Regs.VAddr) 1292 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1293 1294 // It shouldn't be possible to get this far if the two instructions 1295 // don't have a single memoperand, because MachineInstr::mayAlias() 1296 // will return true if this is the case. 
1297 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1298 1299 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1300 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1301 1302 MachineInstr *New = 1303 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1304 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1305 .addImm(MergedOffset) // offset 1306 .addImm(CI.GLC) // glc 1307 .addImm(CI.SLC) // slc 1308 .addImm(0) // tfe 1309 .addImm(CI.DLC) // dlc 1310 .addImm(0) // swz 1311 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1312 1313 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1314 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1315 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1316 1317 // Copy to the old destination registers. 1318 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1319 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1320 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1321 1322 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1323 .add(*Dest0) // Copy to same destination including flags and sub reg. 1324 .addReg(DestReg, 0, SubRegIdx0); 1325 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1326 .add(*Dest1) 1327 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1328 1329 moveInstsAfter(Copy1, InstsToMove); 1330 1331 CI.I->eraseFromParent(); 1332 Paired.I->eraseFromParent(); 1333 return New; 1334 } 1335 1336 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1337 CombineInfo &CI, CombineInfo &Paired, 1338 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1339 MachineBasicBlock *MBB = CI.I->getParent(); 1340 DebugLoc DL = CI.I->getDebugLoc(); 1341 1342 const unsigned Opcode = getNewOpcode(CI, Paired); 1343 1344 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1345 1346 // Copy to the new source register. 1347 Register DestReg = MRI->createVirtualRegister(SuperRC); 1348 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1349 1350 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1351 1352 AddressRegs Regs = getRegs(Opcode, *TII); 1353 1354 if (Regs.VAddr) 1355 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1356 1357 unsigned JoinedFormat = 1358 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1359 1360 // It shouldn't be possible to get this far if the two instructions 1361 // don't have a single memoperand, because MachineInstr::mayAlias() 1362 // will return true if this is the case. 1363 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1364 1365 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1366 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1367 1368 MachineInstr *New = 1369 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1370 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1371 .addImm(MergedOffset) // offset 1372 .addImm(JoinedFormat) // format 1373 .addImm(CI.GLC) // glc 1374 .addImm(CI.SLC) // slc 1375 .addImm(0) // tfe 1376 .addImm(CI.DLC) // dlc 1377 .addImm(0) // swz 1378 .addMemOperand( 1379 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1380 1381 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1382 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1383 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1384 1385 // Copy to the old destination registers. 
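  // E.g. (illustrative) for a two-dword merge, the wide result is split back
  // as COPY vdata0 = DestReg.sub0 and COPY vdata1 = DestReg.sub1;
  // getSubRegIdxs() returns the indices swapped when the original offsets (or
  // dmasks) were in the reverse order.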
1386 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1387 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1388 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1389 1390 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1391 .add(*Dest0) // Copy to same destination including flags and sub reg. 1392 .addReg(DestReg, 0, SubRegIdx0); 1393 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1394 .add(*Dest1) 1395 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1396 1397 moveInstsAfter(Copy1, InstsToMove); 1398 1399 CI.I->eraseFromParent(); 1400 Paired.I->eraseFromParent(); 1401 return New; 1402 } 1403 1404 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1405 CombineInfo &CI, CombineInfo &Paired, 1406 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1407 MachineBasicBlock *MBB = CI.I->getParent(); 1408 DebugLoc DL = CI.I->getDebugLoc(); 1409 1410 const unsigned Opcode = getNewOpcode(CI, Paired); 1411 1412 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1413 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1414 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1415 1416 // Copy to the new source register. 1417 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1418 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1419 1420 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1421 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1422 1423 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1424 .add(*Src0) 1425 .addImm(SubRegIdx0) 1426 .add(*Src1) 1427 .addImm(SubRegIdx1); 1428 1429 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1430 .addReg(SrcReg, RegState::Kill); 1431 1432 AddressRegs Regs = getRegs(Opcode, *TII); 1433 1434 if (Regs.VAddr) 1435 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1436 1437 unsigned JoinedFormat = 1438 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1439 1440 // It shouldn't be possible to get this far if the two instructions 1441 // don't have a single memoperand, because MachineInstr::mayAlias() 1442 // will return true if this is the case. 
1443 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1444 1445 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1446 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1447 1448 MachineInstr *New = 1449 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1450 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1451 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1452 .addImm(JoinedFormat) // format 1453 .addImm(CI.GLC) // glc 1454 .addImm(CI.SLC) // slc 1455 .addImm(0) // tfe 1456 .addImm(CI.DLC) // dlc 1457 .addImm(0) // swz 1458 .addMemOperand( 1459 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1460 1461 moveInstsAfter(MIB, InstsToMove); 1462 1463 CI.I->eraseFromParent(); 1464 Paired.I->eraseFromParent(); 1465 return New; 1466 } 1467 1468 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1469 const CombineInfo &Paired) { 1470 const unsigned Width = CI.Width + Paired.Width; 1471 1472 switch (CI.InstClass) { 1473 default: 1474 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1475 // FIXME: Handle d16 correctly 1476 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1477 Width); 1478 case TBUFFER_LOAD: 1479 case TBUFFER_STORE: 1480 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1481 Width); 1482 1483 case UNKNOWN: 1484 llvm_unreachable("Unknown instruction class"); 1485 case S_BUFFER_LOAD_IMM: 1486 switch (Width) { 1487 default: 1488 return 0; 1489 case 2: 1490 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1491 case 4: 1492 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1493 } 1494 case MIMG: 1495 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1496 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1497 } 1498 } 1499 1500 std::pair<unsigned, unsigned> 1501 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1502 1503 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1504 return std::make_pair(0, 0); 1505 1506 bool ReverseOrder; 1507 if (CI.InstClass == MIMG) { 1508 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1509 "No overlaps"); 1510 ReverseOrder = CI.DMask > Paired.DMask; 1511 } else 1512 ReverseOrder = CI.Offset > Paired.Offset; 1513 1514 static const unsigned Idxs[4][4] = { 1515 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1516 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1517 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1518 {AMDGPU::sub3, 0, 0, 0}, 1519 }; 1520 unsigned Idx0; 1521 unsigned Idx1; 1522 1523 assert(CI.Width >= 1 && CI.Width <= 3); 1524 assert(Paired.Width >= 1 && Paired.Width <= 3); 1525 1526 if (ReverseOrder) { 1527 Idx1 = Idxs[0][Paired.Width - 1]; 1528 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1529 } else { 1530 Idx0 = Idxs[0][CI.Width - 1]; 1531 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1532 } 1533 1534 return std::make_pair(Idx0, Idx1); 1535 } 1536 1537 const TargetRegisterClass * 1538 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1539 const CombineInfo &Paired) { 1540 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1541 switch (CI.Width + Paired.Width) { 1542 default: 1543 return nullptr; 1544 case 2: 1545 return &AMDGPU::SReg_64_XEXECRegClass; 1546 case 4: 1547 return &AMDGPU::SGPR_128RegClass; 1548 case 8: 1549 return &AMDGPU::SGPR_256RegClass; 1550 case 16: 1551 return &AMDGPU::SGPR_512RegClass; 1552 } 1553 } 
else { 1554 switch (CI.Width + Paired.Width) { 1555 default: 1556 return nullptr; 1557 case 2: 1558 return &AMDGPU::VReg_64RegClass; 1559 case 3: 1560 return &AMDGPU::VReg_96RegClass; 1561 case 4: 1562 return &AMDGPU::VReg_128RegClass; 1563 } 1564 } 1565 } 1566 1567 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1568 CombineInfo &CI, CombineInfo &Paired, 1569 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1570 MachineBasicBlock *MBB = CI.I->getParent(); 1571 DebugLoc DL = CI.I->getDebugLoc(); 1572 1573 const unsigned Opcode = getNewOpcode(CI, Paired); 1574 1575 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1576 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1577 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1578 1579 // Copy to the new source register. 1580 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1581 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1582 1583 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1584 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1585 1586 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1587 .add(*Src0) 1588 .addImm(SubRegIdx0) 1589 .add(*Src1) 1590 .addImm(SubRegIdx1); 1591 1592 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1593 .addReg(SrcReg, RegState::Kill); 1594 1595 AddressRegs Regs = getRegs(Opcode, *TII); 1596 1597 if (Regs.VAddr) 1598 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1599 1600 1601 // It shouldn't be possible to get this far if the two instructions 1602 // don't have a single memoperand, because MachineInstr::mayAlias() 1603 // will return true if this is the case. 1604 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1605 1606 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1607 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1608 1609 MachineInstr *New = 1610 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1611 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1612 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1613 .addImm(CI.GLC) // glc 1614 .addImm(CI.SLC) // slc 1615 .addImm(0) // tfe 1616 .addImm(CI.DLC) // dlc 1617 .addImm(0) // swz 1618 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1619 1620 moveInstsAfter(MIB, InstsToMove); 1621 1622 CI.I->eraseFromParent(); 1623 Paired.I->eraseFromParent(); 1624 return New; 1625 } 1626 1627 MachineOperand 1628 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1629 APInt V(32, Val, true); 1630 if (TII->isInlineConstant(V)) 1631 return MachineOperand::CreateImm(Val); 1632 1633 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1634 MachineInstr *Mov = 1635 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1636 TII->get(AMDGPU::S_MOV_B32), Reg) 1637 .addImm(Val); 1638 (void)Mov; 1639 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1640 return MachineOperand::CreateReg(Reg, false); 1641 } 1642 1643 // Compute base address using Addr and return the final register. 
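// A rough sketch of the sequence this emits (virtual register names are
// illustrative only):
//   %offlo:sgpr_32 = S_MOV_B32 lo32(Addr.Offset)   ; or an inline immediate
//   %lo:vgpr_32, %carry = V_ADD_I32_e64 %Addr.Base.LoReg, %offlo, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 %Addr.Base.HiReg, hi32(Addr.Offset), %carry, 0
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1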
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
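// Only the vaddr register (with its kill flag cleared) and the immediate
// offset operand are rewritten; e.g., a sketch with illustrative values:
//   GLOBAL_LOAD_DWORDX2 %oldbase, 0  -->  GLOBAL_LOAD_DWORDX2 %newbase, -4096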
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - the 32-bit base registers and subregisters
// - the 64-bit constant offset
// Expecting a base computation of the form:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
//   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
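  // Only GLOBAL_* opcodes that have a saddr variant are handled here;
  // getGlobalSaddrOp() is expected to return a negative value for anything
  // else.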
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and the 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) that has the highest 13-bit
  // distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load3 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
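    // Anchor candidates below are restricted to instructions with the same
    // opcode and a still-zero immediate offset; the legal distance is checked
    // with isLegalGlobalAddressingMode(), which presumably corresponds to the
    // signed 13-bit range mentioned above, though the exact range is target
    // dependent.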
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this,
    // so break the search. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Discard any list that holds only a single instruction, since a
  // merge needs at least two, and sort the remaining lists by offset so that
  // instructions which can be merged end up adjacent to each other. The actual
  // pairing of instructions is done later, in optimizeInstsWithSameBaseAddr().

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

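  // Per-block driver: each block is processed in sections bounded by ordered
  // memory references. For every section we first collect the mergeable
  // instructions and then repeatedly merge pairs until no further
  // opportunities remain.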
LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); 2176 2177 bool Modified = false; 2178 2179 // Contains the list of instructions for which constant offsets are being 2180 // promoted to the IMM. This is tracked for an entire block at time. 2181 SmallPtrSet<MachineInstr *, 4> AnchorList; 2182 MemInfoMap Visited; 2183 2184 for (MachineBasicBlock &MBB : MF) { 2185 MachineBasicBlock::iterator SectionEnd; 2186 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; 2187 I = SectionEnd) { 2188 bool CollectModified; 2189 std::list<std::list<CombineInfo>> MergeableInsts; 2190 2191 // First pass: Collect list of all instructions we know how to merge in a 2192 // subset of the block. 2193 std::tie(SectionEnd, CollectModified) = 2194 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts); 2195 2196 Modified |= CollectModified; 2197 2198 do { 2199 OptimizeAgain = false; 2200 Modified |= optimizeBlock(MergeableInsts); 2201 } while (OptimizeAgain); 2202 } 2203 2204 Visited.clear(); 2205 AnchorList.clear(); 2206 } 2207 2208 return Modified; 2209 } 2210