//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit in the 8 bits, we can add to
//   the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 156 return false; 157 } 158 return true; 159 } 160 161 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 162 const GCNSubtarget &STM); 163 }; 164 165 struct BaseRegisters { 166 Register LoReg; 167 Register HiReg; 168 169 unsigned LoSubReg = 0; 170 unsigned HiSubReg = 0; 171 }; 172 173 struct MemAddress { 174 BaseRegisters Base; 175 int64_t Offset = 0; 176 }; 177 178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 179 180 private: 181 const GCNSubtarget *STM = nullptr; 182 const SIInstrInfo *TII = nullptr; 183 const SIRegisterInfo *TRI = nullptr; 184 MachineRegisterInfo *MRI = nullptr; 185 AliasAnalysis *AA = nullptr; 186 bool OptimizeAgain; 187 188 static bool dmasksCanBeCombined(const CombineInfo &CI, 189 const SIInstrInfo &TII, 190 const CombineInfo &Paired); 191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 192 CombineInfo &Paired, bool Modify = false); 193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 194 const CombineInfo &Paired); 195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 197 const CombineInfo &Paired); 198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 199 const CombineInfo &Paired); 200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 201 202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 203 SmallVectorImpl<MachineInstr *> &InstsToMove); 204 205 unsigned read2Opcode(unsigned EltSize) const; 206 unsigned read2ST64Opcode(unsigned EltSize) const; 207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 208 CombineInfo &Paired, 209 const SmallVectorImpl<MachineInstr *> &InstsToMove); 210 211 unsigned write2Opcode(unsigned EltSize) const; 212 unsigned write2ST64Opcode(unsigned EltSize) const; 213 MachineBasicBlock::iterator 214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 215 const SmallVectorImpl<MachineInstr *> &InstsToMove); 216 MachineBasicBlock::iterator 217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 218 const SmallVectorImpl<MachineInstr *> &InstsToMove); 219 MachineBasicBlock::iterator 220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 221 const SmallVectorImpl<MachineInstr *> &InstsToMove); 222 MachineBasicBlock::iterator 223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 224 const SmallVectorImpl<MachineInstr *> &InstsToMove); 225 MachineBasicBlock::iterator 226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 227 const SmallVectorImpl<MachineInstr *> &InstsToMove); 228 MachineBasicBlock::iterator 229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 230 const SmallVectorImpl<MachineInstr *> &InstsToMove); 231 MachineBasicBlock::iterator 232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 233 const SmallVectorImpl<MachineInstr *> &InstsToMove); 234 235 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 236 int32_t NewOffset) const; 237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 241 /// Promotes constant offset to the immediate by adjusting the base. 
It 242 /// tries to use a base from the nearby instructions that allows it to have 243 /// a 13bit constant offset which gets promoted to the immediate. 244 bool promoteConstantOffsetToImm(MachineInstr &CI, 245 MemInfoMap &Visited, 246 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 247 void addInstToMergeableList(const CombineInfo &CI, 248 std::list<std::list<CombineInfo> > &MergeableInsts) const; 249 250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 253 std::list<std::list<CombineInfo>> &MergeableInsts) const; 254 255 public: 256 static char ID; 257 258 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 260 } 261 262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 263 bool &OptimizeListAgain); 264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 265 266 bool runOnMachineFunction(MachineFunction &MF) override; 267 268 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 269 270 void getAnalysisUsage(AnalysisUsage &AU) const override { 271 AU.setPreservesCFG(); 272 AU.addRequired<AAResultsWrapperPass>(); 273 274 MachineFunctionPass::getAnalysisUsage(AU); 275 } 276 277 MachineFunctionProperties getRequiredProperties() const override { 278 return MachineFunctionProperties() 279 .set(MachineFunctionProperties::Property::IsSSA); 280 } 281 }; 282 283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 284 const unsigned Opc = MI.getOpcode(); 285 286 if (TII.isMUBUF(Opc)) { 287 // FIXME: Handle d16 correctly 288 return AMDGPU::getMUBUFElements(Opc); 289 } 290 if (TII.isMIMG(MI)) { 291 uint64_t DMaskImm = 292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 293 return countPopulation(DMaskImm); 294 } 295 if (TII.isMTBUF(Opc)) { 296 return AMDGPU::getMTBUFElements(Opc); 297 } 298 299 switch (Opc) { 300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 301 return 1; 302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 303 return 2; 304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 305 return 4; 306 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 307 return 8; 308 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 309 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 310 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 311 case AMDGPU::DS_WRITE_B32_gfx9: 312 return 1; 313 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 314 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 315 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 316 case AMDGPU::DS_WRITE_B64_gfx9: 317 return 2; 318 default: 319 return 0; 320 } 321 } 322 323 /// Maps instruction opcode to enum InstClassEnum. 324 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 325 switch (Opc) { 326 default: 327 if (TII.isMUBUF(Opc)) { 328 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 329 default: 330 return UNKNOWN; 331 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 332 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 333 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 334 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 335 return BUFFER_LOAD; 336 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 337 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 338 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 339 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 340 return BUFFER_STORE; 341 } 342 } 343 if (TII.isMIMG(Opc)) { 344 // Ignore instructions encoded without vaddr. 
345 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 346 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 347 return UNKNOWN; 348 // Ignore BVH instructions 349 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 350 return UNKNOWN; 351 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 352 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 353 TII.isGather4(Opc)) 354 return UNKNOWN; 355 return MIMG; 356 } 357 if (TII.isMTBUF(Opc)) { 358 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 359 default: 360 return UNKNOWN; 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 363 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 364 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 365 return TBUFFER_LOAD; 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 368 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 369 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 370 return TBUFFER_STORE; 371 } 372 } 373 return UNKNOWN; 374 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 375 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 377 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 378 return S_BUFFER_LOAD_IMM; 379 case AMDGPU::DS_READ_B32: 380 case AMDGPU::DS_READ_B32_gfx9: 381 case AMDGPU::DS_READ_B64: 382 case AMDGPU::DS_READ_B64_gfx9: 383 return DS_READ; 384 case AMDGPU::DS_WRITE_B32: 385 case AMDGPU::DS_WRITE_B32_gfx9: 386 case AMDGPU::DS_WRITE_B64: 387 case AMDGPU::DS_WRITE_B64_gfx9: 388 return DS_WRITE; 389 } 390 } 391 392 /// Determines instruction subclass from opcode. Only instructions 393 /// of the same subclass can be merged together. 394 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 395 switch (Opc) { 396 default: 397 if (TII.isMUBUF(Opc)) 398 return AMDGPU::getMUBUFBaseOpcode(Opc); 399 if (TII.isMIMG(Opc)) { 400 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 401 assert(Info); 402 return Info->BaseOpcode; 403 } 404 if (TII.isMTBUF(Opc)) 405 return AMDGPU::getMTBUFBaseOpcode(Opc); 406 return -1; 407 case AMDGPU::DS_READ_B32: 408 case AMDGPU::DS_READ_B32_gfx9: 409 case AMDGPU::DS_READ_B64: 410 case AMDGPU::DS_READ_B64_gfx9: 411 case AMDGPU::DS_WRITE_B32: 412 case AMDGPU::DS_WRITE_B32_gfx9: 413 case AMDGPU::DS_WRITE_B64: 414 case AMDGPU::DS_WRITE_B64_gfx9: 415 return Opc; 416 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 417 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 418 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 420 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 421 } 422 } 423 424 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 425 AddressRegs Result; 426 427 if (TII.isMUBUF(Opc)) { 428 if (AMDGPU::getMUBUFHasVAddr(Opc)) 429 Result.VAddr = true; 430 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 431 Result.SRsrc = true; 432 if (AMDGPU::getMUBUFHasSoffset(Opc)) 433 Result.SOffset = true; 434 435 return Result; 436 } 437 438 if (TII.isMIMG(Opc)) { 439 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 440 if (VAddr0Idx >= 0) { 441 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 442 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 443 } else { 444 Result.VAddr = true; 445 } 446 Result.SRsrc = true; 447 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 448 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 449 Result.SSamp = true; 450 451 return Result; 452 } 453 if (TII.isMTBUF(Opc)) { 454 if (AMDGPU::getMTBUFHasVAddr(Opc)) 455 Result.VAddr = 
true; 456 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 457 Result.SRsrc = true; 458 if (AMDGPU::getMTBUFHasSoffset(Opc)) 459 Result.SOffset = true; 460 461 return Result; 462 } 463 464 switch (Opc) { 465 default: 466 return Result; 467 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 468 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 469 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 470 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 471 Result.SBase = true; 472 return Result; 473 case AMDGPU::DS_READ_B32: 474 case AMDGPU::DS_READ_B64: 475 case AMDGPU::DS_READ_B32_gfx9: 476 case AMDGPU::DS_READ_B64_gfx9: 477 case AMDGPU::DS_WRITE_B32: 478 case AMDGPU::DS_WRITE_B64: 479 case AMDGPU::DS_WRITE_B32_gfx9: 480 case AMDGPU::DS_WRITE_B64_gfx9: 481 Result.Addr = true; 482 return Result; 483 } 484 } 485 486 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 487 const SIInstrInfo &TII, 488 const GCNSubtarget &STM) { 489 I = MI; 490 unsigned Opc = MI->getOpcode(); 491 InstClass = getInstClass(Opc, TII); 492 493 if (InstClass == UNKNOWN) 494 return; 495 496 switch (InstClass) { 497 case DS_READ: 498 EltSize = 499 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 500 : 4; 501 break; 502 case DS_WRITE: 503 EltSize = 504 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 505 : 4; 506 break; 507 case S_BUFFER_LOAD_IMM: 508 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 509 break; 510 default: 511 EltSize = 4; 512 break; 513 } 514 515 if (InstClass == MIMG) { 516 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 517 // Offset is not considered for MIMG instructions. 518 Offset = 0; 519 } else { 520 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 521 Offset = I->getOperand(OffsetIdx).getImm(); 522 } 523 524 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 525 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 526 527 Width = getOpcodeWidth(*I, TII); 528 529 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 530 Offset &= 0xffff; 531 } else if (InstClass != MIMG) { 532 CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 533 } 534 535 AddressRegs Regs = getRegs(Opc, TII); 536 537 NumAddresses = 0; 538 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 539 AddrIdx[NumAddresses++] = 540 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 541 if (Regs.Addr) 542 AddrIdx[NumAddresses++] = 543 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 544 if (Regs.SBase) 545 AddrIdx[NumAddresses++] = 546 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 547 if (Regs.SRsrc) 548 AddrIdx[NumAddresses++] = 549 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 550 if (Regs.SOffset) 551 AddrIdx[NumAddresses++] = 552 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 553 if (Regs.VAddr) 554 AddrIdx[NumAddresses++] = 555 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 556 if (Regs.SSamp) 557 AddrIdx[NumAddresses++] = 558 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 559 assert(NumAddresses <= MaxAddressRegs); 560 561 for (unsigned J = 0; J < NumAddresses; J++) 562 AddrReg[J] = &I->getOperand(AddrIdx[J]); 563 } 564 565 } // end anonymous namespace. 

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Op.getReg().isPhysical())
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && Use.getReg().isPhysical() &&
                         PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
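  // For now, conservatively reject pairs with identical offsets; merging them
  // would not produce a wider access.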
749 if (CI.Offset == Paired.Offset) 750 return false; 751 752 // This won't be valid if the offset isn't aligned. 753 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 754 return false; 755 756 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 757 758 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 759 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 760 if (!Info0) 761 return false; 762 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 763 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 764 if (!Info1) 765 return false; 766 767 if (Info0->BitsPerComp != Info1->BitsPerComp || 768 Info0->NumFormat != Info1->NumFormat) 769 return false; 770 771 // TODO: Should be possible to support more formats, but if format loads 772 // are not dword-aligned, the merged load might not be valid. 773 if (Info0->BitsPerComp != 32) 774 return false; 775 776 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 777 return false; 778 } 779 780 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 781 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 782 CI.UseST64 = false; 783 CI.BaseOff = 0; 784 785 // Handle all non-DS instructions. 786 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 787 return (EltOffset0 + CI.Width == EltOffset1 || 788 EltOffset1 + Paired.Width == EltOffset0) && 789 CI.CPol == Paired.CPol && 790 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); 791 } 792 793 // If the offset in elements doesn't fit in 8-bits, we might be able to use 794 // the stride 64 versions. 795 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 796 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 797 if (Modify) { 798 CI.Offset = EltOffset0 / 64; 799 Paired.Offset = EltOffset1 / 64; 800 CI.UseST64 = true; 801 } 802 return true; 803 } 804 805 // Check if the new offsets fit in the reduced 8-bit range. 806 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 807 if (Modify) { 808 CI.Offset = EltOffset0; 809 Paired.Offset = EltOffset1; 810 } 811 return true; 812 } 813 814 // Try to shift base address to decrease offsets. 815 uint32_t Min = std::min(EltOffset0, EltOffset1); 816 uint32_t Max = std::max(EltOffset0, EltOffset1); 817 818 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 819 if (((Max - Min) & ~Mask) == 0) { 820 if (Modify) { 821 // From the range of values we could use for BaseOff, choose the one that 822 // is aligned to the highest power of two, to maximise the chance that 823 // the same offset can be reused for other load/store pairs. 824 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 825 // Copy the low bits of the offsets, so that when we adjust them by 826 // subtracting BaseOff they will be multiples of 64. 827 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 828 CI.BaseOff = BaseOff * CI.EltSize; 829 CI.Offset = (EltOffset0 - BaseOff) / 64; 830 Paired.Offset = (EltOffset1 - BaseOff) / 64; 831 CI.UseST64 = true; 832 } 833 return true; 834 } 835 836 if (isUInt<8>(Max - Min)) { 837 if (Modify) { 838 // From the range of values we could use for BaseOff, choose the one that 839 // is aligned to the highest power of two, to maximise the chance that 840 // the same offset can be reused for other load/store pairs. 
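      // Any base in [Max - 0xff, Min] keeps both adjusted offsets within
      // 8 bits, which is why that range is passed to mostAlignedValueInRange
      // below.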
841 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 842 CI.BaseOff = BaseOff * CI.EltSize; 843 CI.Offset = EltOffset0 - BaseOff; 844 Paired.Offset = EltOffset1 - BaseOff; 845 } 846 return true; 847 } 848 849 return false; 850 } 851 852 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 853 const CombineInfo &CI, 854 const CombineInfo &Paired) { 855 const unsigned Width = (CI.Width + Paired.Width); 856 switch (CI.InstClass) { 857 default: 858 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 859 case S_BUFFER_LOAD_IMM: 860 switch (Width) { 861 default: 862 return false; 863 case 2: 864 case 4: 865 case 8: 866 return true; 867 } 868 } 869 } 870 871 const TargetRegisterClass * 872 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 873 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 874 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 875 } 876 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 877 return TRI->getRegClassForReg(*MRI, Src->getReg()); 878 } 879 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 880 return TRI->getRegClassForReg(*MRI, Src->getReg()); 881 } 882 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 883 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 884 } 885 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 886 return TRI->getRegClassForReg(*MRI, Src->getReg()); 887 } 888 return nullptr; 889 } 890 891 /// This function assumes that CI comes before Paired in a basic block. 892 bool SILoadStoreOptimizer::checkAndPrepareMerge( 893 CombineInfo &CI, CombineInfo &Paired, 894 SmallVectorImpl<MachineInstr *> &InstsToMove) { 895 896 // Check both offsets (or masks for MIMG) can be combined and fit in the 897 // reduced range. 898 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) 899 return false; 900 901 if (CI.InstClass != MIMG && 902 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) 903 return false; 904 905 const unsigned Opc = CI.I->getOpcode(); 906 const InstClassEnum InstClass = getInstClass(Opc, *TII); 907 908 if (InstClass == UNKNOWN) { 909 return false; 910 } 911 const unsigned InstSubclass = getInstSubclass(Opc, *TII); 912 913 // Do not merge VMEM buffer instructions with "swizzled" bit set. 914 int Swizzled = 915 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); 916 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) 917 return false; 918 919 DenseSet<Register> RegDefsToMove; 920 DenseSet<Register> PhysRegUsesToMove; 921 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); 922 923 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); 924 bool IsAGPR = TRI->hasAGPRs(DataRC); 925 926 MachineBasicBlock::iterator E = std::next(Paired.I); 927 MachineBasicBlock::iterator MBBI = std::next(CI.I); 928 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); 929 for (; MBBI != E; ++MBBI) { 930 931 if (MBBI == MBBE) { 932 // CombineInfo::Order is a hint on the instruction ordering within the 933 // basic block. This hint suggests that CI precedes Paired, which is 934 // true most of the time. However, moveInstsAfter() processing a 935 // previous list may have changed this order in a situation when it 936 // moves an instruction which exists in some other merge list. 937 // In this case it must be dependent. 
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              InstsToMove))
      continue;

    if (&*MBBI == &*Paired.I) {
      if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
        return false;
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction the verifier
      //        will complain.
      if (IsAGPR && CI.InstClass == DS_WRITE)
        return false;

      // We need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {

        // Call offsetsCanBeCombined with modify = true so that the offsets are
        // correct for the new instruction. This should return true, because
        // this function should only be called on CombineInfo objects that
        // have already been confirmed to be mergeable.
        if (CI.InstClass != MIMG)
          offsetsCanBeCombined(CI, *STM, Paired, true);
        return true;
      }
      return false;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
1025 // check if we can move I across MBBI and if we can move all I's users 1026 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 1027 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 1028 break; 1029 } 1030 return false; 1031 } 1032 1033 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1034 if (STM->ldsRequiresM0Init()) 1035 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1036 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1037 } 1038 1039 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1040 if (STM->ldsRequiresM0Init()) 1041 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1042 1043 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1044 : AMDGPU::DS_READ2ST64_B64_gfx9; 1045 } 1046 1047 MachineBasicBlock::iterator 1048 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1049 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1050 MachineBasicBlock *MBB = CI.I->getParent(); 1051 1052 // Be careful, since the addresses could be subregisters themselves in weird 1053 // cases, like vectors of pointers. 1054 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1055 1056 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1057 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1058 1059 unsigned NewOffset0 = CI.Offset; 1060 unsigned NewOffset1 = Paired.Offset; 1061 unsigned Opc = 1062 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1063 1064 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1065 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1066 1067 if (NewOffset0 > NewOffset1) { 1068 // Canonicalize the merged instruction so the smaller offset comes first. 1069 std::swap(NewOffset0, NewOffset1); 1070 std::swap(SubRegIdx0, SubRegIdx1); 1071 } 1072 1073 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1074 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1075 1076 const MCInstrDesc &Read2Desc = TII->get(Opc); 1077 1078 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1079 Register DestReg = MRI->createVirtualRegister(SuperRC); 1080 1081 DebugLoc DL = CI.I->getDebugLoc(); 1082 1083 Register BaseReg = AddrReg->getReg(); 1084 unsigned BaseSubReg = AddrReg->getSubReg(); 1085 unsigned BaseRegFlags = 0; 1086 if (CI.BaseOff) { 1087 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1088 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1089 .addImm(CI.BaseOff); 1090 1091 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1092 BaseRegFlags = RegState::Kill; 1093 1094 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1095 .addReg(ImmReg) 1096 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1097 .addImm(0); // clamp bit 1098 BaseSubReg = 0; 1099 } 1100 1101 MachineInstrBuilder Read2 = 1102 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1103 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1104 .addImm(NewOffset0) // offset0 1105 .addImm(NewOffset1) // offset1 1106 .addImm(0) // gds 1107 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1108 1109 (void)Read2; 1110 1111 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1112 1113 // Copy to the old destination registers. 1114 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1115 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1116 .addReg(DestReg, 0, SubRegIdx0); 1117 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1118 .add(*Dest1) 1119 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1120 1121 moveInstsAfter(Copy1, InstsToMove); 1122 1123 CI.I->eraseFromParent(); 1124 Paired.I->eraseFromParent(); 1125 1126 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1127 return Read2; 1128 } 1129 1130 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1131 if (STM->ldsRequiresM0Init()) 1132 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1133 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1134 : AMDGPU::DS_WRITE2_B64_gfx9; 1135 } 1136 1137 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1138 if (STM->ldsRequiresM0Init()) 1139 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1140 : AMDGPU::DS_WRITE2ST64_B64; 1141 1142 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1143 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1144 } 1145 1146 MachineBasicBlock::iterator 1147 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1148 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1149 MachineBasicBlock *MBB = CI.I->getParent(); 1150 1151 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1152 // sure we preserve the subregister index and any register flags set on them. 1153 const MachineOperand *AddrReg = 1154 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1155 const MachineOperand *Data0 = 1156 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1157 const MachineOperand *Data1 = 1158 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1159 1160 unsigned NewOffset0 = CI.Offset; 1161 unsigned NewOffset1 = Paired.Offset; 1162 unsigned Opc = 1163 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1164 1165 if (NewOffset0 > NewOffset1) { 1166 // Canonicalize the merged instruction so the smaller offset comes first. 
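    // The data operands are swapped together with the offsets so that each
    // value is still written to its original address.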
1167 std::swap(NewOffset0, NewOffset1); 1168 std::swap(Data0, Data1); 1169 } 1170 1171 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1172 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1173 1174 const MCInstrDesc &Write2Desc = TII->get(Opc); 1175 DebugLoc DL = CI.I->getDebugLoc(); 1176 1177 Register BaseReg = AddrReg->getReg(); 1178 unsigned BaseSubReg = AddrReg->getSubReg(); 1179 unsigned BaseRegFlags = 0; 1180 if (CI.BaseOff) { 1181 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1182 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1183 .addImm(CI.BaseOff); 1184 1185 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1186 BaseRegFlags = RegState::Kill; 1187 1188 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1189 .addReg(ImmReg) 1190 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1191 .addImm(0); // clamp bit 1192 BaseSubReg = 0; 1193 } 1194 1195 MachineInstrBuilder Write2 = 1196 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1197 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1198 .add(*Data0) // data0 1199 .add(*Data1) // data1 1200 .addImm(NewOffset0) // offset0 1201 .addImm(NewOffset1) // offset1 1202 .addImm(0) // gds 1203 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1204 1205 moveInstsAfter(Write2, InstsToMove); 1206 1207 CI.I->eraseFromParent(); 1208 Paired.I->eraseFromParent(); 1209 1210 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1211 return Write2; 1212 } 1213 1214 MachineBasicBlock::iterator 1215 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1216 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1217 MachineBasicBlock *MBB = CI.I->getParent(); 1218 DebugLoc DL = CI.I->getDebugLoc(); 1219 const unsigned Opcode = getNewOpcode(CI, Paired); 1220 1221 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1222 1223 Register DestReg = MRI->createVirtualRegister(SuperRC); 1224 unsigned MergedDMask = CI.DMask | Paired.DMask; 1225 unsigned DMaskIdx = 1226 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1227 1228 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1229 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1230 if (I == DMaskIdx) 1231 MIB.addImm(MergedDMask); 1232 else 1233 MIB.add((*CI.I).getOperand(I)); 1234 } 1235 1236 // It shouldn't be possible to get this far if the two instructions 1237 // don't have a single memoperand, because MachineInstr::mayAlias() 1238 // will return true if this is the case. 1239 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1240 1241 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1242 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1243 1244 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1245 1246 unsigned SubRegIdx0, SubRegIdx1; 1247 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1248 1249 // Copy to the old destination registers. 1250 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1251 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1252 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1253 1254 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1255 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1256 .addReg(DestReg, 0, SubRegIdx0); 1257 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1258 .add(*Dest1) 1259 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1260 1261 moveInstsAfter(Copy1, InstsToMove); 1262 1263 CI.I->eraseFromParent(); 1264 Paired.I->eraseFromParent(); 1265 return New; 1266 } 1267 1268 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1269 CombineInfo &CI, CombineInfo &Paired, 1270 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1271 MachineBasicBlock *MBB = CI.I->getParent(); 1272 DebugLoc DL = CI.I->getDebugLoc(); 1273 const unsigned Opcode = getNewOpcode(CI, Paired); 1274 1275 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1276 1277 Register DestReg = MRI->createVirtualRegister(SuperRC); 1278 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1279 1280 // It shouldn't be possible to get this far if the two instructions 1281 // don't have a single memoperand, because MachineInstr::mayAlias() 1282 // will return true if this is the case. 1283 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1284 1285 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1286 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1287 1288 MachineInstr *New = 1289 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1290 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1291 .addImm(MergedOffset) // offset 1292 .addImm(CI.CPol) // cpol 1293 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1294 1295 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1296 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1297 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1298 1299 // Copy to the old destination registers. 1300 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1301 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1302 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1303 1304 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1305 .add(*Dest0) // Copy to same destination including flags and sub reg. 1306 .addReg(DestReg, 0, SubRegIdx0); 1307 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1308 .add(*Dest1) 1309 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1310 1311 moveInstsAfter(Copy1, InstsToMove); 1312 1313 CI.I->eraseFromParent(); 1314 Paired.I->eraseFromParent(); 1315 return New; 1316 } 1317 1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1319 CombineInfo &CI, CombineInfo &Paired, 1320 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1321 MachineBasicBlock *MBB = CI.I->getParent(); 1322 DebugLoc DL = CI.I->getDebugLoc(); 1323 1324 const unsigned Opcode = getNewOpcode(CI, Paired); 1325 1326 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1327 1328 // Copy to the new source register. 1329 Register DestReg = MRI->createVirtualRegister(SuperRC); 1330 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1331 1332 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1333 1334 AddressRegs Regs = getRegs(Opcode, *TII); 1335 1336 if (Regs.VAddr) 1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1338 1339 // It shouldn't be possible to get this far if the two instructions 1340 // don't have a single memoperand, because MachineInstr::mayAlias() 1341 // will return true if this is the case. 
1342 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1343 1344 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1345 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1346 1347 MachineInstr *New = 1348 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1349 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1350 .addImm(MergedOffset) // offset 1351 .addImm(CI.CPol) // cpol 1352 .addImm(0) // tfe 1353 .addImm(0) // swz 1354 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1355 1356 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1357 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1358 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1359 1360 // Copy to the old destination registers. 1361 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1362 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1363 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1364 1365 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1366 .add(*Dest0) // Copy to same destination including flags and sub reg. 1367 .addReg(DestReg, 0, SubRegIdx0); 1368 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1369 .add(*Dest1) 1370 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1371 1372 moveInstsAfter(Copy1, InstsToMove); 1373 1374 CI.I->eraseFromParent(); 1375 Paired.I->eraseFromParent(); 1376 return New; 1377 } 1378 1379 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1380 CombineInfo &CI, CombineInfo &Paired, 1381 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1382 MachineBasicBlock *MBB = CI.I->getParent(); 1383 DebugLoc DL = CI.I->getDebugLoc(); 1384 1385 const unsigned Opcode = getNewOpcode(CI, Paired); 1386 1387 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1388 1389 // Copy to the new source register. 1390 Register DestReg = MRI->createVirtualRegister(SuperRC); 1391 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1392 1393 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1394 1395 AddressRegs Regs = getRegs(Opcode, *TII); 1396 1397 if (Regs.VAddr) 1398 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1399 1400 unsigned JoinedFormat = 1401 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1402 1403 // It shouldn't be possible to get this far if the two instructions 1404 // don't have a single memoperand, because MachineInstr::mayAlias() 1405 // will return true if this is the case. 1406 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1407 1408 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1409 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1410 1411 MachineInstr *New = 1412 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1413 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1414 .addImm(MergedOffset) // offset 1415 .addImm(JoinedFormat) // format 1416 .addImm(CI.CPol) // cpol 1417 .addImm(0) // tfe 1418 .addImm(0) // swz 1419 .addMemOperand( 1420 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1421 1422 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1423 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1424 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1425 1426 // Copy to the old destination registers. 
1427 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1428 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1429 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1430 1431 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1432 .add(*Dest0) // Copy to same destination including flags and sub reg. 1433 .addReg(DestReg, 0, SubRegIdx0); 1434 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1435 .add(*Dest1) 1436 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1437 1438 moveInstsAfter(Copy1, InstsToMove); 1439 1440 CI.I->eraseFromParent(); 1441 Paired.I->eraseFromParent(); 1442 return New; 1443 } 1444 1445 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1446 CombineInfo &CI, CombineInfo &Paired, 1447 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1448 MachineBasicBlock *MBB = CI.I->getParent(); 1449 DebugLoc DL = CI.I->getDebugLoc(); 1450 1451 const unsigned Opcode = getNewOpcode(CI, Paired); 1452 1453 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1454 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1455 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1456 1457 // Copy to the new source register. 1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1459 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1460 1461 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1462 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1463 1464 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1465 .add(*Src0) 1466 .addImm(SubRegIdx0) 1467 .add(*Src1) 1468 .addImm(SubRegIdx1); 1469 1470 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1471 .addReg(SrcReg, RegState::Kill); 1472 1473 AddressRegs Regs = getRegs(Opcode, *TII); 1474 1475 if (Regs.VAddr) 1476 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1477 1478 unsigned JoinedFormat = 1479 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1480 1481 // It shouldn't be possible to get this far if the two instructions 1482 // don't have a single memoperand, because MachineInstr::mayAlias() 1483 // will return true if this is the case. 
1484 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1485 1486 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1487 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1488 1489 MachineInstr *New = 1490 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1491 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1492 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1493 .addImm(JoinedFormat) // format 1494 .addImm(CI.CPol) // cpol 1495 .addImm(0) // tfe 1496 .addImm(0) // swz 1497 .addMemOperand( 1498 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1499 1500 moveInstsAfter(MIB, InstsToMove); 1501 1502 CI.I->eraseFromParent(); 1503 Paired.I->eraseFromParent(); 1504 return New; 1505 } 1506 1507 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1508 const CombineInfo &Paired) { 1509 const unsigned Width = CI.Width + Paired.Width; 1510 1511 switch (CI.InstClass) { 1512 default: 1513 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1514 // FIXME: Handle d16 correctly 1515 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1516 Width); 1517 case TBUFFER_LOAD: 1518 case TBUFFER_STORE: 1519 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1520 Width); 1521 1522 case UNKNOWN: 1523 llvm_unreachable("Unknown instruction class"); 1524 case S_BUFFER_LOAD_IMM: 1525 switch (Width) { 1526 default: 1527 return 0; 1528 case 2: 1529 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1530 case 4: 1531 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1532 case 8: 1533 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1534 } 1535 case MIMG: 1536 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1537 "No overlaps"); 1538 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1539 } 1540 } 1541 1542 std::pair<unsigned, unsigned> 1543 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1544 const CombineInfo &Paired) { 1545 1546 assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero"); 1547 1548 bool ReverseOrder; 1549 if (CI.InstClass == MIMG) { 1550 assert( 1551 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1552 "No overlaps"); 1553 ReverseOrder = CI.DMask > Paired.DMask; 1554 } else 1555 ReverseOrder = CI.Offset > Paired.Offset; 1556 1557 unsigned Idx0; 1558 unsigned Idx1; 1559 1560 if (CI.Width + Paired.Width > 4) { 1561 assert(CI.Width == 4 && Paired.Width == 4); 1562 1563 if (ReverseOrder) { 1564 Idx1 = AMDGPU::sub0_sub1_sub2_sub3; 1565 Idx0 = AMDGPU::sub4_sub5_sub6_sub7; 1566 } else { 1567 Idx0 = AMDGPU::sub0_sub1_sub2_sub3; 1568 Idx1 = AMDGPU::sub4_sub5_sub6_sub7; 1569 } 1570 } else { 1571 static const unsigned Idxs[4][4] = { 1572 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1573 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1574 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1575 {AMDGPU::sub3, 0, 0, 0}, 1576 }; 1577 1578 assert(CI.Width >= 1 && CI.Width <= 3); 1579 assert(Paired.Width >= 1 && Paired.Width <= 3); 1580 1581 if (ReverseOrder) { 1582 Idx1 = Idxs[0][Paired.Width - 1]; 1583 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1584 } else { 1585 Idx0 = Idxs[0][CI.Width - 1]; 1586 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1587 } 1588 } 1589 1590 return std::make_pair(Idx0, Idx1); 1591 } 1592 1593 const TargetRegisterClass * 1594 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1595 const CombineInfo &Paired) { 1596 if (CI.InstClass == 
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
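// A rough sketch of what this emits (illustrative MIR only; the register names
// are made up and operands such as the clamp bit are omitted):
//   %off_lo = S_MOV_B32 <low 32 bits of Addr.Offset>   (only if not inline)
//   %lo, %carry = V_ADD_CO_U32_e64 %Addr.Base.LoReg, %off_lo
//   %hi = V_ADDC_U32_e64 %Addr.Base.HiReg, <high 32 bits>, %carry
//   %newbase = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1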
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
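// For example (illustrative; the exact operand layout depends on the opcode),
// a rebased global load conceptually changes from
//   %d = GLOBAL_LOAD_DWORDX2 %oldvaddr, 0, 0
// to
//   %d = GLOBAL_LOAD_DWORDX2 %newbase, -4096, 0
// where -4096 is the offset of the old address relative to the new base.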
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
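  // (getGlobalSaddrOp() returns a negative value for opcodes that have no
  // global SADDR variant, so the check below effectively limits this
  // optimization to global loads and stores.)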
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset has the largest distance from MI's
  // offset that still fits in 13 bits.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;  load1 = load(addr1, 0)
  //   addr2 = &a + 6144;  load2 = load(addr2, 0)
  //   addr3 = &a + 8192;  load3 = load(addr3, 0)
  //   addr4 = &a + 10240; load4 = load(addr4, 0)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because the maximum distance can presumably accommodate
  // more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288; load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
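    // Only instructions with the same opcode as MI whose offset field is still
    // zero (i.e. not already promoted) are considered as anchor candidates
    // below.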
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this, so
    // break the search. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort each list by offset. Lists that contain only a single
  // instruction cannot produce a merge and are deleted here; the actual
  // pairing is done later by optimizeInstsWithSameBaseAddr().

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // We have found only one instruction with a given address that can be
      // merged, and we need at least 2 instructions to do a merge, so this
      // list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the list by offset so that mergeable instructions end up adjacent
    // to each other, which makes it easier to find matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

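  // Each basic block is processed in sections delimited by instructions with
  // ordered memory references (see collectMergeableInsts); each section is
  // then re-optimized until no further merges are found.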
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}
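// A quick way to exercise this pass in isolation on MIR input (a sketch; it
// assumes the pass is registered under its DEBUG_TYPE name, as is conventional
// for machine passes):
//   llc -march=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt -o - input.mir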