//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but whose differences are small enough to fit in 8 bits, we can
//   add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass.
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    // Compare by pointer order.
    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI,
                               const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                          MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return countPopulation(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:         LLVM_FALLTHROUGH;
  case AMDGPU::DS_READ_B32_gfx9:    LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B32:        LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:         LLVM_FALLTHROUGH;
  case AMDGPU::DS_READ_B64_gfx9:    LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B64:        LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}

// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or
// GLOBAL_STORE. If either or both instructions are non-segment-specific FLAT,
// the resulting combined operation will be FLAT, potentially promoting one of
// the GLOBAL operations to FLAT. For other instructions, return the original
// unmodified class.
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.
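
// Legacy pass-manager registration. The AAResultsWrapperPass dependency
// provides the alias analysis used to decide whether two memory operations
// can be reordered past the instructions between them when forming a pair.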
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// Given that \p CI and \p Paired are adjacent memory operations, produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
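  // The two dmasks must not share any channels: every set bit of the smaller
  // mask has to lie strictly below the lowest set bit of the larger mask, so
  // the merged dmask is simply the union of the two.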
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width,
                                     STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
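  // For example, element offsets 0 and 2048 (byte offsets 0 and 8192 with a
  // 4-byte element) are both multiples of 64 and encode as 0 and 32 in the
  // ST64 form, which fits the 8-bit offset fields.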
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
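/// The returned CombineInfo indicates where the merged instruction will be
/// placed: at CI when the paired load can be hoisted up to it, or at Paired
/// when the first store can be sunk down to it.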
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
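
  // The swap below keeps the smaller offset as offset0 and pairs each original
  // destination with the matching half of the merged result: the smaller-offset
  // load receives the low half and the larger-offset load the high half.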
  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR; 1647 } 1648 case GLOBAL_STORE: 1649 switch (Width) { 1650 default: 1651 return 0; 1652 case 2: 1653 return AMDGPU::GLOBAL_STORE_DWORDX2; 1654 case 3: 1655 return AMDGPU::GLOBAL_STORE_DWORDX3; 1656 case 4: 1657 return AMDGPU::GLOBAL_STORE_DWORDX4; 1658 } 1659 case GLOBAL_STORE_SADDR: 1660 switch (Width) { 1661 default: 1662 return 0; 1663 case 2: 1664 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR; 1665 case 3: 1666 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR; 1667 case 4: 1668 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR; 1669 } 1670 case FLAT_LOAD: 1671 switch (Width) { 1672 default: 1673 return 0; 1674 case 2: 1675 return AMDGPU::FLAT_LOAD_DWORDX2; 1676 case 3: 1677 return AMDGPU::FLAT_LOAD_DWORDX3; 1678 case 4: 1679 return AMDGPU::FLAT_LOAD_DWORDX4; 1680 } 1681 case FLAT_STORE: 1682 switch (Width) { 1683 default: 1684 return 0; 1685 case 2: 1686 return AMDGPU::FLAT_STORE_DWORDX2; 1687 case 3: 1688 return AMDGPU::FLAT_STORE_DWORDX3; 1689 case 4: 1690 return AMDGPU::FLAT_STORE_DWORDX4; 1691 } 1692 case MIMG: 1693 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1694 "No overlaps"); 1695 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1696 } 1697 } 1698 1699 std::pair<unsigned, unsigned> 1700 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1701 const CombineInfo &Paired) { 1702 assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) == 1703 CI.Width + Paired.Width)) && 1704 "No overlaps"); 1705 1706 unsigned Idx0; 1707 unsigned Idx1; 1708 1709 static const unsigned Idxs[5][4] = { 1710 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1711 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4}, 1712 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1713 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1714 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1715 }; 1716 1717 assert(CI.Width >= 1 && CI.Width <= 4); 1718 assert(Paired.Width >= 1 && Paired.Width <= 4); 1719 1720 if (Paired < CI) { 1721 Idx1 = Idxs[0][Paired.Width - 1]; 1722 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1723 } else { 1724 Idx0 = Idxs[0][CI.Width - 1]; 1725 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1726 } 1727 1728 return std::make_pair(Idx0, Idx1); 1729 } 1730 1731 const TargetRegisterClass * 1732 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1733 const CombineInfo &Paired) { 1734 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1735 switch (CI.Width + Paired.Width) { 1736 default: 1737 return nullptr; 1738 case 2: 1739 return &AMDGPU::SReg_64_XEXECRegClass; 1740 case 4: 1741 return &AMDGPU::SGPR_128RegClass; 1742 case 8: 1743 return &AMDGPU::SGPR_256RegClass; 1744 case 16: 1745 return &AMDGPU::SGPR_512RegClass; 1746 } 1747 } 1748 1749 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1750 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1751 ? 
TRI->getAGPRClassForBitWidth(BitWidth) 1752 : TRI->getVGPRClassForBitWidth(BitWidth); 1753 } 1754 1755 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1756 CombineInfo &CI, CombineInfo &Paired, 1757 MachineBasicBlock::iterator InsertBefore) { 1758 MachineBasicBlock *MBB = CI.I->getParent(); 1759 DebugLoc DL = CI.I->getDebugLoc(); 1760 1761 const unsigned Opcode = getNewOpcode(CI, Paired); 1762 1763 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1764 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1765 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1766 1767 // Copy to the new source register. 1768 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1769 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1770 1771 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1772 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1773 1774 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1775 .add(*Src0) 1776 .addImm(SubRegIdx0) 1777 .add(*Src1) 1778 .addImm(SubRegIdx1); 1779 1780 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode)) 1781 .addReg(SrcReg, RegState::Kill); 1782 1783 AddressRegs Regs = getRegs(Opcode, *TII); 1784 1785 if (Regs.VAddr) 1786 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1787 1788 1789 // It shouldn't be possible to get this far if the two instructions 1790 // don't have a single memoperand, because MachineInstr::mayAlias() 1791 // will return true if this is the case. 1792 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1793 1794 MachineInstr *New = 1795 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1796 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1797 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1798 .addImm(CI.CPol) // cpol 1799 .addImm(0) // tfe 1800 .addImm(0) // swz 1801 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); 1802 1803 CI.I->eraseFromParent(); 1804 Paired.I->eraseFromParent(); 1805 return New; 1806 } 1807 1808 MachineOperand 1809 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1810 APInt V(32, Val, true); 1811 if (TII->isInlineConstant(V)) 1812 return MachineOperand::CreateImm(Val); 1813 1814 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1815 MachineInstr *Mov = 1816 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1817 TII->get(AMDGPU::S_MOV_B32), Reg) 1818 .addImm(Val); 1819 (void)Mov; 1820 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1821 return MachineOperand::CreateReg(Reg, false); 1822 } 1823 1824 // Compute base address using Addr and return the final register. 
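// Roughly, the sequence emitted below is a 64-bit add of Addr.Base and
// Addr.Offset (register names in this sketch are illustrative only):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32 = V_ADDC_U32_e64 Base.HiReg, OffsetHi, killed %carry, 0
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1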
1825 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1826 const MemAddress &Addr) const { 1827 MachineBasicBlock *MBB = MI.getParent(); 1828 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1829 DebugLoc DL = MI.getDebugLoc(); 1830 1831 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1832 Addr.Base.LoSubReg) && 1833 "Expected 32-bit Base-Register-Low!!"); 1834 1835 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1836 Addr.Base.HiSubReg) && 1837 "Expected 32-bit Base-Register-Hi!!"); 1838 1839 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1840 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1841 MachineOperand OffsetHi = 1842 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1843 1844 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1845 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1846 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1847 1848 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1849 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1850 MachineInstr *LoHalf = 1851 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0) 1852 .addReg(CarryReg, RegState::Define) 1853 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1854 .add(OffsetLo) 1855 .addImm(0); // clamp bit 1856 (void)LoHalf; 1857 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1858 1859 MachineInstr *HiHalf = 1860 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1861 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1862 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1863 .add(OffsetHi) 1864 .addReg(CarryReg, RegState::Kill) 1865 .addImm(0); // clamp bit 1866 (void)HiHalf; 1867 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1868 1869 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class()); 1870 MachineInstr *FullBase = 1871 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1872 .addReg(DestSub0) 1873 .addImm(AMDGPU::sub0) 1874 .addReg(DestSub1) 1875 .addImm(AMDGPU::sub1); 1876 (void)FullBase; 1877 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1878 1879 return FullDestReg; 1880 } 1881 1882 // Update base and offset with the NewBase and NewOffset in MI. 
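// The rewrite is done in place: the vaddr operand is retargeted to NewBase
// (clearing its kill flag) and the offset operand's immediate is replaced
// with NewOffset.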
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyzes Base and extracts:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
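  // Only global VMEM accesses are handled for now: opcodes for which
  // getGlobalSaddrOp() reports no SADDR form (a negative value) are rejected
  // below.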
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset is the farthest from MI's offset
  // while still within 13-bit reach.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization tries to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate addresses.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2 and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,  0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
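    // For now an anchor candidate must use the same opcode as MI and still
    // have a zero immediate offset; anything else is skipped.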
2056 if (MINext.getOpcode() != MI.getOpcode() || 2057 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) 2058 continue; 2059 2060 const MachineOperand &BaseNext = 2061 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); 2062 MemAddress MAddrNext; 2063 if (Visited.find(&MINext) == Visited.end()) { 2064 processBaseWithConstOffset(BaseNext, MAddrNext); 2065 Visited[&MINext] = MAddrNext; 2066 } else 2067 MAddrNext = Visited[&MINext]; 2068 2069 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || 2070 MAddrNext.Base.HiReg != MAddr.Base.HiReg || 2071 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || 2072 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) 2073 continue; 2074 2075 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset)); 2076 2077 int64_t Dist = MAddr.Offset - MAddrNext.Offset; 2078 TargetLoweringBase::AddrMode AM; 2079 AM.HasBaseReg = true; 2080 AM.BaseOffs = Dist; 2081 if (TLI->isLegalGlobalAddressingMode(AM) && 2082 (uint32_t)std::abs(Dist) > MaxDist) { 2083 MaxDist = std::abs(Dist); 2084 2085 AnchorAddr = MAddrNext; 2086 AnchorInst = &MINext; 2087 } 2088 } 2089 2090 if (AnchorInst) { 2091 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; 2092 AnchorInst->dump()); 2093 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " 2094 << AnchorAddr.Offset << "\n\n"); 2095 2096 // Instead of moving up, just re-compute anchor-instruction's base address. 2097 Register Base = computeBase(MI, AnchorAddr); 2098 2099 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); 2100 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); 2101 2102 for (auto P : InstsWCommonBase) { 2103 TargetLoweringBase::AddrMode AM; 2104 AM.HasBaseReg = true; 2105 AM.BaseOffs = P.second - AnchorAddr.Offset; 2106 2107 if (TLI->isLegalGlobalAddressingMode(AM)) { 2108 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; 2109 dbgs() << ")"; P.first->dump()); 2110 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); 2111 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); 2112 } 2113 } 2114 AnchorList.insert(AnchorInst); 2115 return true; 2116 } 2117 2118 return false; 2119 } 2120 2121 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, 2122 std::list<std::list<CombineInfo> > &MergeableInsts) const { 2123 for (std::list<CombineInfo> &AddrList : MergeableInsts) { 2124 if (AddrList.front().InstClass == CI.InstClass && 2125 AddrList.front().IsAGPR == CI.IsAGPR && 2126 AddrList.front().hasSameBaseAddress(*CI.I)) { 2127 AddrList.emplace_back(CI); 2128 return; 2129 } 2130 } 2131 2132 // Base address not found, so add a new list. 2133 MergeableInsts.emplace_back(1, CI); 2134 } 2135 2136 std::pair<MachineBasicBlock::iterator, bool> 2137 SILoadStoreOptimizer::collectMergeableInsts( 2138 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 2139 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 2140 std::list<std::list<CombineInfo>> &MergeableInsts) const { 2141 bool Modified = false; 2142 2143 // Sort potential mergeable instructions into lists. One list per base address. 2144 unsigned Order = 0; 2145 MachineBasicBlock::iterator BlockI = Begin; 2146 for (; BlockI != End; ++BlockI) { 2147 MachineInstr &MI = *BlockI; 2148 2149 // We run this before checking if an address is mergeable, because it can produce 2150 // better code even if the instructions aren't mergeable. 
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers; we can still look for separate merges after such a barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we report that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we created such an instruction the verifier
      //        would complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions, one list per base
  // address.
  //
  // Part 2: Sort each list by offset so that instructions which can be merged
  // end up adjacent to each other, and discard lists with fewer than two
  // entries, since a merge needs at least two instructions. The actual
  // pairing of candidates is done later, per list, in
  // optimizeInstsWithSameBaseAddr().

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
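// A merge list may be processed several times: whenever a pair is combined
// and the list could still contain further candidates, OptimizeAgain is set
// and the caller reruns optimizeBlock() until no more merges are found.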
2231 bool SILoadStoreOptimizer::optimizeBlock( 2232 std::list<std::list<CombineInfo> > &MergeableInsts) { 2233 bool Modified = false; 2234 2235 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(), 2236 E = MergeableInsts.end(); I != E;) { 2237 std::list<CombineInfo> &MergeList = *I; 2238 2239 bool OptimizeListAgain = false; 2240 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { 2241 // We weren't able to make any changes, so delete the list so we don't 2242 // process the same instructions the next time we try to optimize this 2243 // block. 2244 I = MergeableInsts.erase(I); 2245 continue; 2246 } 2247 2248 Modified = true; 2249 2250 // We made changes, but also determined that there were no more optimization 2251 // opportunities, so we don't need to reprocess the list 2252 if (!OptimizeListAgain) { 2253 I = MergeableInsts.erase(I); 2254 continue; 2255 } 2256 OptimizeAgain = true; 2257 } 2258 return Modified; 2259 } 2260 2261 bool 2262 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2263 std::list<CombineInfo> &MergeList, 2264 bool &OptimizeListAgain) { 2265 if (MergeList.empty()) 2266 return false; 2267 2268 bool Modified = false; 2269 2270 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2271 Next = std::next(I)) { 2272 2273 auto First = I; 2274 auto Second = Next; 2275 2276 if ((*First).Order > (*Second).Order) 2277 std::swap(First, Second); 2278 CombineInfo &CI = *First; 2279 CombineInfo &Paired = *Second; 2280 2281 CombineInfo *Where = checkAndPrepareMerge(CI, Paired); 2282 if (!Where) { 2283 ++I; 2284 continue; 2285 } 2286 2287 Modified = true; 2288 2289 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I); 2290 2291 MachineBasicBlock::iterator NewMI; 2292 switch (CI.InstClass) { 2293 default: 2294 llvm_unreachable("unknown InstClass"); 2295 break; 2296 case DS_READ: 2297 NewMI = mergeRead2Pair(CI, Paired, Where->I); 2298 break; 2299 case DS_WRITE: 2300 NewMI = mergeWrite2Pair(CI, Paired, Where->I); 2301 break; 2302 case S_BUFFER_LOAD_IMM: 2303 NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); 2304 OptimizeListAgain |= CI.Width + Paired.Width < 8; 2305 break; 2306 case BUFFER_LOAD: 2307 NewMI = mergeBufferLoadPair(CI, Paired, Where->I); 2308 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2309 break; 2310 case BUFFER_STORE: 2311 NewMI = mergeBufferStorePair(CI, Paired, Where->I); 2312 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2313 break; 2314 case MIMG: 2315 NewMI = mergeImagePair(CI, Paired, Where->I); 2316 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2317 break; 2318 case TBUFFER_LOAD: 2319 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I); 2320 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2321 break; 2322 case TBUFFER_STORE: 2323 NewMI = mergeTBufferStorePair(CI, Paired, Where->I); 2324 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2325 break; 2326 case FLAT_LOAD: 2327 case GLOBAL_LOAD: 2328 case GLOBAL_LOAD_SADDR: 2329 NewMI = mergeFlatLoadPair(CI, Paired, Where->I); 2330 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2331 break; 2332 case FLAT_STORE: 2333 case GLOBAL_STORE: 2334 case GLOBAL_STORE_SADDR: 2335 NewMI = mergeFlatStorePair(CI, Paired, Where->I); 2336 OptimizeListAgain |= CI.Width + Paired.Width < 4; 2337 break; 2338 } 2339 CI.setMI(NewMI, *this); 2340 CI.Order = Where->Order; 2341 if (I == Second) 2342 I = Next; 2343 2344 MergeList.erase(Second); 2345 } 2346 2347 return Modified; 2348 } 2349 2350 bool 
SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect a list of all instructions we know how to merge
      // in a subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}