1 //===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Merge the offset of address calculation into the offset field 10 // of instructions in a global address lowering sequence. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "LoongArch.h" 15 #include "LoongArchTargetMachine.h" 16 #include "llvm/CodeGen/MachineFunctionPass.h" 17 #include "llvm/CodeGen/Passes.h" 18 #include "llvm/MC/TargetRegistry.h" 19 #include "llvm/Support/Debug.h" 20 #include "llvm/Target/TargetOptions.h" 21 #include <optional> 22 23 using namespace llvm; 24 25 #define DEBUG_TYPE "loongarch-merge-base-offset" 26 #define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset" 27 28 namespace { 29 30 class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass { 31 const LoongArchSubtarget *ST = nullptr; 32 MachineRegisterInfo *MRI; 33 34 public: 35 static char ID; 36 bool runOnMachineFunction(MachineFunction &Fn) override; 37 bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12, 38 MachineInstr *&Lo20, MachineInstr *&Hi12, 39 MachineInstr *&Last); 40 bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add, 41 MachineInstr *&Lo12); 42 43 bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12, 44 MachineInstr *&Lo20, MachineInstr *&Hi12, 45 MachineInstr *&Last); 46 void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, 47 MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, 48 int64_t Offset); 49 bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12, 50 MachineInstr *&Lo20, MachineInstr *&Hi12, 51 MachineInstr *&Last, MachineInstr &TailAdd, 52 Register GAReg); 53 54 bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12, 55 MachineInstr *&Lo20, MachineInstr *&Hi12, 56 MachineInstr *&Last); 57 58 LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} 59 60 MachineFunctionProperties getRequiredProperties() const override { 61 return MachineFunctionProperties().setIsSSA(); 62 } 63 64 void getAnalysisUsage(AnalysisUsage &AU) const override { 65 AU.setPreservesCFG(); 66 MachineFunctionPass::getAnalysisUsage(AU); 67 } 68 69 StringRef getPassName() const override { 70 return LoongArch_MERGE_BASE_OFFSET_NAME; 71 } 72 }; 73 } // end anonymous namespace 74 75 char LoongArchMergeBaseOffsetOpt::ID = 0; 76 INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE, 77 LoongArch_MERGE_BASE_OFFSET_NAME, false, false) 78 79 // Detect either of the patterns: 80 // 81 // 1. (small/medium): 82 // pcalau12i vreg1, %pc_hi20(s) 83 // addi.d vreg2, vreg1, %pc_lo12(s) 84 // 85 // 2. (large): 86 // pcalau12i vreg1, %pc_hi20(s) 87 // addi.d vreg2, $zero, %pc_lo12(s) 88 // lu32i.d vreg3, vreg2, %pc64_lo20(s) 89 // lu52i.d vreg4, vreg3, %pc64_hi12(s) 90 // add.d vreg5, vreg4, vreg1 91 92 // The pattern is only accepted if: 93 // 1) For small and medium pattern, the first instruction has only one use, 94 // which is the ADDI. 95 // 2) For large pattern, the first four instructions each have only one use, 96 // and the user of the fourth instruction is ADD. 97 // 3) The address operands have the appropriate type, reflecting the 98 // lowering of a global address or constant pool using the pattern. 99 // 4) The offset value in the Global Address or Constant Pool is 0. 100 bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, 101 MachineInstr *&Lo12, 102 MachineInstr *&Lo20, 103 MachineInstr *&Hi12, 104 MachineInstr *&Last) { 105 if (Hi20.getOpcode() != LoongArch::PCALAU12I) 106 return false; 107 108 const MachineOperand &Hi20Op1 = Hi20.getOperand(1); 109 if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI) 110 return false; 111 112 auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) { 113 return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress(); 114 }; 115 116 if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0) 117 return false; 118 119 Register HiDestReg = Hi20.getOperand(0).getReg(); 120 if (!MRI->hasOneUse(HiDestReg)) 121 return false; 122 123 MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg); 124 if (UseInst->getOpcode() != LoongArch::ADD_D) { 125 Lo12 = UseInst; 126 if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || 127 (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) 128 return false; 129 } else { 130 assert(ST->is64Bit()); 131 Last = UseInst; 132 133 Register LastOp1Reg = Last->getOperand(1).getReg(); 134 if (!LastOp1Reg.isVirtual()) 135 return false; 136 Hi12 = MRI->getVRegDef(LastOp1Reg); 137 const MachineOperand &Hi12Op2 = Hi12->getOperand(2); 138 if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI) 139 return false; 140 if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0) 141 return false; 142 if (!MRI->hasOneUse(Hi12->getOperand(0).getReg())) 143 return false; 144 145 Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg()); 146 const MachineOperand &Lo20Op2 = Lo20->getOperand(2); 147 if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO) 148 return false; 149 if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0) 150 return false; 151 if (!MRI->hasOneUse(Lo20->getOperand(0).getReg())) 152 return false; 153 154 Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg()); 155 if (!MRI->hasOneUse(Lo12->getOperand(0).getReg())) 156 return false; 157 } 158 159 const MachineOperand &Lo12Op2 = Lo12->getOperand(2); 160 assert(Hi20.getOpcode() == LoongArch::PCALAU12I); 161 if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO || 162 !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) || 163 Lo12Op2.getOffset() != 0) 164 return false; 165 166 if (Hi20Op1.isGlobal()) { 167 LLVM_DEBUG(dbgs() << " Found lowered global address: " 168 << *Hi20Op1.getGlobal() << "\n"); 169 } else if (Hi20Op1.isBlockAddress()) { 170 LLVM_DEBUG(dbgs() << " Found lowered basic address: " 171 << *Hi20Op1.getBlockAddress() << "\n"); 172 } else if (Hi20Op1.isCPI()) { 173 LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() 174 << "\n"); 175 } 176 177 return true; 178 } 179 180 // Detect the pattern: 181 // 182 // (small/medium): 183 // lu12i.w vreg1, %le_hi20_r(s) 184 // add.w/d vreg2, vreg1, r2, %le_add_r(s) 185 // addi.w/d vreg3, vreg2, %le_lo12_r(s) 186 187 // The pattern is only accepted if: 188 // 1) The first instruction has only one use, which is the PseudoAddTPRel. 189 // The second instruction has only one use, which is the ADDI. The 190 // second instruction's last operand is the tp register. 191 // 2) The address operands have the appropriate type, reflecting the 192 // lowering of a thread_local global address using the pattern. 193 // 3) The offset value in the ThreadLocal Global Address is 0. 194 bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, 195 MachineInstr *&Add, 196 MachineInstr *&Lo12) { 197 if (Hi20.getOpcode() != LoongArch::LU12I_W) 198 return false; 199 200 auto isGlobalOrCPI = [](const MachineOperand &Op) { 201 return Op.isGlobal() || Op.isCPI(); 202 }; 203 204 const MachineOperand &Hi20Op1 = Hi20.getOperand(1); 205 if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R || 206 !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0) 207 return false; 208 209 Register HiDestReg = Hi20.getOperand(0).getReg(); 210 if (!MRI->hasOneUse(HiDestReg)) 211 return false; 212 213 Add = &*MRI->use_instr_begin(HiDestReg); 214 if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) || 215 (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W)) 216 return false; 217 218 if (Add->getOperand(2).getReg() != LoongArch::R2) 219 return false; 220 221 const MachineOperand &AddOp3 = Add->getOperand(3); 222 if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R || 223 !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) || 224 AddOp3.getOffset() != 0) 225 return false; 226 227 Register AddDestReg = Add->getOperand(0).getReg(); 228 if (!MRI->hasOneUse(AddDestReg)) 229 return false; 230 231 Lo12 = &*MRI->use_instr_begin(AddDestReg); 232 if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || 233 (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) 234 return false; 235 236 const MachineOperand &Lo12Op2 = Lo12->getOperand(2); 237 if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R || 238 !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) || 239 Lo12Op2.getOffset() != 0) 240 return false; 241 242 if (Hi20Op1.isGlobal()) { 243 LLVM_DEBUG(dbgs() << " Found lowered global address: " 244 << *Hi20Op1.getGlobal() << "\n"); 245 } else if (Hi20Op1.isCPI()) { 246 LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() 247 << "\n"); 248 } 249 250 return true; 251 } 252 253 // Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions. 254 // Delete the tail instruction and update all the uses to use the 255 // output from Last. 256 void LoongArchMergeBaseOffsetOpt::foldOffset( 257 MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, 258 MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, 259 int64_t Offset) { 260 // Put the offset back in Hi and the Lo 261 Hi20.getOperand(1).setOffset(Offset); 262 Lo12.getOperand(2).setOffset(Offset); 263 if (Lo20 && Hi12) { 264 Lo20->getOperand(2).setOffset(Offset); 265 Hi12->getOperand(2).setOffset(Offset); 266 } 267 268 // For tls-le, offset of the second PseudoAddTPRel instr should also be 269 // updated. 270 MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); 271 if (Hi20.getOpcode() == LoongArch::LU12I_W) 272 Add->getOperand(3).setOffset(Offset); 273 274 // Delete the tail instruction. 275 MachineInstr *Def = Last ? Last : &Lo12; 276 MRI->constrainRegClass(Def->getOperand(0).getReg(), 277 MRI->getRegClass(Tail.getOperand(0).getReg())); 278 MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg()); 279 Tail.eraseFromParent(); 280 281 LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n" 282 << " " << Hi20;); 283 if (Hi20.getOpcode() == LoongArch::LU12I_W) { 284 LLVM_DEBUG(dbgs() << " " << *Add;); 285 } 286 LLVM_DEBUG(dbgs() << " " << Lo12;); 287 if (Lo20 && Hi12) { 288 LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;); 289 } 290 } 291 292 // Detect patterns for large offsets that are passed into an ADD instruction. 293 // If the pattern is found, updates the offset in Hi20, (Add), Lo12, 294 // (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that 295 // produced the offset. 296 // 297 // (The instructions marked with "!" are not necessarily present) 298 // 299 // Base address lowering is of the form: 300 // 1) pcala: 301 // Hi20: pcalau12i vreg1, %pc_hi20(s) 302 // +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s) 303 // | Lo20: lu32i.d vreg2, %pc64_lo20(s) ! 304 // +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) ! 305 // | 306 // | 2) tls-le: 307 // | Hi20: lu12i.w vreg1, %le_hi20_r(s) 308 // | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s) 309 // +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s) 310 // | 311 // | The large offset can be one of the forms: 312 // | 313 // +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits: 314 // | OffsetHi20: lu12i.w vreg3, 4 315 // | OffsetLo12: ori voff, vreg3, 188 ------------------+ 316 // | | 317 // +-> 2) Offset that has non zero bits in Hi20 bits only: | 318 // | OffsetHi20: lu12i.w voff, 128 ------------------+ 319 // | | 320 // +-> 3) Offset that has non zero bits in Lo20 bits: | 321 // | OffsetHi20: lu12i.w vreg3, 121 ! | 322 // | OffsetLo12: ori voff, vreg3, 122 ! | 323 // | OffsetLo20: lu32i.d voff, 123 ------------------+ 324 // +-> 4) Offset that has non zero bits in Hi12 bits: | 325 // OffsetHi20: lu12i.w vreg3, 121 ! | 326 // OffsetLo12: ori voff, vreg3, 122 ! | 327 // OffsetLo20: lu32i.d vreg3, 123 ! | 328 // OffsetHi12: lu52i.d voff, vrg3, 124 ------------------+ 329 // | 330 // TailAdd: add.d vreg4, vreg2, voff <------------------+ 331 // 332 bool LoongArchMergeBaseOffsetOpt::foldLargeOffset( 333 MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, 334 MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd, 335 Register GAReg) { 336 assert((TailAdd.getOpcode() == LoongArch::ADD_W || 337 TailAdd.getOpcode() == LoongArch::ADD_D) && 338 "Expected ADD instruction!"); 339 Register Rs = TailAdd.getOperand(1).getReg(); 340 Register Rt = TailAdd.getOperand(2).getReg(); 341 Register Reg = Rs == GAReg ? Rt : Rs; 342 SmallVector<MachineInstr *, 4> Instrs; 343 int64_t Offset = 0; 344 int64_t Mask = -1; 345 346 // This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]: 347 for (int i = 0; i < 4; i++) { 348 // Handle Reg is R0. 349 if (Reg == LoongArch::R0) 350 break; 351 352 // Can't fold if the register has more than one use. 353 if (!Reg.isVirtual() || !MRI->hasOneUse(Reg)) 354 return false; 355 356 MachineInstr *Curr = MRI->getVRegDef(Reg); 357 if (!Curr) 358 break; 359 360 switch (Curr->getOpcode()) { 361 default: 362 // Can't fold if the instruction opcode is unexpected. 363 return false; 364 case LoongArch::ORI: { 365 MachineOperand ImmOp = Curr->getOperand(2); 366 if (ImmOp.getTargetFlags() != LoongArchII::MO_None) 367 return false; 368 Offset += ImmOp.getImm(); 369 Reg = Curr->getOperand(1).getReg(); 370 Instrs.push_back(Curr); 371 break; 372 } 373 case LoongArch::LU12I_W: { 374 MachineOperand ImmOp = Curr->getOperand(1); 375 if (ImmOp.getTargetFlags() != LoongArchII::MO_None) 376 return false; 377 Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask; 378 Reg = LoongArch::R0; 379 Instrs.push_back(Curr); 380 break; 381 } 382 case LoongArch::LU32I_D: { 383 MachineOperand ImmOp = Curr->getOperand(2); 384 if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20) 385 return false; 386 Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask; 387 Mask ^= 0x000FFFFF00000000ULL; 388 Reg = Curr->getOperand(1).getReg(); 389 Instrs.push_back(Curr); 390 break; 391 } 392 case LoongArch::LU52I_D: { 393 MachineOperand ImmOp = Curr->getOperand(2); 394 if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12) 395 return false; 396 Offset += ImmOp.getImm() << 52; 397 Mask ^= 0xFFF0000000000000ULL; 398 Reg = Curr->getOperand(1).getReg(); 399 Instrs.push_back(Curr); 400 break; 401 } 402 } 403 } 404 405 // Can't fold if the offset is not extracted. 406 if (!Offset) 407 return false; 408 409 foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset); 410 LLVM_DEBUG(dbgs() << " Offset Instrs:\n"); 411 for (auto I : Instrs) { 412 LLVM_DEBUG(dbgs() << " " << *I); 413 I->eraseFromParent(); 414 } 415 416 return true; 417 } 418 419 bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20, 420 MachineInstr &Lo12, 421 MachineInstr *&Lo20, 422 MachineInstr *&Hi12, 423 MachineInstr *&Last) { 424 Register DestReg = 425 Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); 426 427 // Look for arithmetic instructions we can get an offset from. 428 // We might be able to remove the arithmetic instructions by folding the 429 // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or 430 // LU12I_W+PseudoAddTPRel+ADDI. 431 if (!MRI->hasOneUse(DestReg)) 432 return false; 433 434 // DestReg has only one use. 435 MachineInstr &Tail = *MRI->use_instr_begin(DestReg); 436 switch (Tail.getOpcode()) { 437 default: 438 LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:" 439 << Tail); 440 break; 441 case LoongArch::ADDI_W: 442 if (ST->is64Bit()) 443 return false; 444 [[fallthrough]]; 445 case LoongArch::ADDI_D: 446 case LoongArch::ADDU16I_D: { 447 // Offset is simply an immediate operand. 448 int64_t Offset = Tail.getOperand(2).getImm(); 449 if (Tail.getOpcode() == LoongArch::ADDU16I_D) 450 Offset = SignExtend64<32>(Offset << 16); 451 452 // We might have two ADDIs in a row. 453 Register TailDestReg = Tail.getOperand(0).getReg(); 454 if (MRI->hasOneUse(TailDestReg)) { 455 MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg); 456 if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W) 457 return false; 458 if (TailTail.getOpcode() == LoongArch::ADDI_W || 459 TailTail.getOpcode() == LoongArch::ADDI_D) { 460 Offset += TailTail.getOperand(2).getImm(); 461 LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail); 462 foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset); 463 Tail.eraseFromParent(); 464 return true; 465 } 466 } 467 468 LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail); 469 foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset); 470 return true; 471 } 472 case LoongArch::ADD_W: 473 if (ST->is64Bit()) 474 return false; 475 [[fallthrough]]; 476 case LoongArch::ADD_D: 477 // The offset is too large to fit in the immediate field of ADDI. 478 return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg); 479 break; 480 } 481 482 return false; 483 } 484 485 // Memory access opcode mapping for transforms. 486 static unsigned getNewOpc(unsigned Op, bool isLarge) { 487 switch (Op) { 488 case LoongArch::LD_B: 489 return isLarge ? LoongArch::LDX_B : LoongArch::LD_B; 490 case LoongArch::LD_H: 491 return isLarge ? LoongArch::LDX_H : LoongArch::LD_H; 492 case LoongArch::LD_W: 493 case LoongArch::LDPTR_W: 494 return isLarge ? LoongArch::LDX_W : LoongArch::LD_W; 495 case LoongArch::LD_D: 496 case LoongArch::LDPTR_D: 497 return isLarge ? LoongArch::LDX_D : LoongArch::LD_D; 498 case LoongArch::LD_BU: 499 return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU; 500 case LoongArch::LD_HU: 501 return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU; 502 case LoongArch::LD_WU: 503 return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU; 504 case LoongArch::FLD_S: 505 return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S; 506 case LoongArch::FLD_D: 507 return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D; 508 case LoongArch::VLD: 509 return isLarge ? LoongArch::VLDX : LoongArch::VLD; 510 case LoongArch::XVLD: 511 return isLarge ? LoongArch::XVLDX : LoongArch::XVLD; 512 case LoongArch::VLDREPL_B: 513 return LoongArch::VLDREPL_B; 514 case LoongArch::XVLDREPL_B: 515 return LoongArch::XVLDREPL_B; 516 case LoongArch::ST_B: 517 return isLarge ? LoongArch::STX_B : LoongArch::ST_B; 518 case LoongArch::ST_H: 519 return isLarge ? LoongArch::STX_H : LoongArch::ST_H; 520 case LoongArch::ST_W: 521 case LoongArch::STPTR_W: 522 return isLarge ? LoongArch::STX_W : LoongArch::ST_W; 523 case LoongArch::ST_D: 524 case LoongArch::STPTR_D: 525 return isLarge ? LoongArch::STX_D : LoongArch::ST_D; 526 case LoongArch::FST_S: 527 return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S; 528 case LoongArch::FST_D: 529 return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D; 530 case LoongArch::VST: 531 return isLarge ? LoongArch::VSTX : LoongArch::VST; 532 case LoongArch::XVST: 533 return isLarge ? LoongArch::XVSTX : LoongArch::XVST; 534 default: 535 llvm_unreachable("Unexpected opcode for replacement"); 536 } 537 } 538 539 bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, 540 MachineInstr &Lo12, 541 MachineInstr *&Lo20, 542 MachineInstr *&Hi12, 543 MachineInstr *&Last) { 544 Register DestReg = 545 Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); 546 547 // If all the uses are memory ops with the same offset, we can transform: 548 // 549 // 1. (small/medium): 550 // 1.1. pcala 551 // pcalau12i vreg1, %pc_hi20(s) 552 // addi.d vreg2, vreg1, %pc_lo12(s) 553 // ld.w vreg3, 8(vreg2) 554 // 555 // => 556 // 557 // pcalau12i vreg1, %pc_hi20(s+8) 558 // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1) 559 // 560 // 1.2. tls-le 561 // lu12i.w vreg1, %le_hi20_r(s) 562 // add.w/d vreg2, vreg1, r2, %le_add_r(s) 563 // addi.w/d vreg3, vreg2, %le_lo12_r(s) 564 // ld.w vreg4, 8(vreg3) 565 // 566 // => 567 // 568 // lu12i.w vreg1, %le_hi20_r(s+8) 569 // add.w/d vreg2, vreg1, r2, %le_add_r(s+8) 570 // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2) 571 // 572 // 2. (large): 573 // pcalau12i vreg1, %pc_hi20(s) 574 // addi.d vreg2, $zero, %pc_lo12(s) 575 // lu32i.d vreg3, vreg2, %pc64_lo20(s) 576 // lu52i.d vreg4, vreg3, %pc64_hi12(s) 577 // add.d vreg5, vreg4, vreg1 578 // ld.w vreg6, 8(vreg5) 579 // 580 // => 581 // 582 // pcalau12i vreg1, %pc_hi20(s+8) 583 // addi.d vreg2, $zero, %pc_lo12(s+8) 584 // lu32i.d vreg3, vreg2, %pc64_lo20(s+8) 585 // lu52i.d vreg4, vreg3, %pc64_hi12(s+8) 586 // ldx.w vreg6, vreg4, vreg1 587 588 std::optional<int64_t> CommonOffset; 589 DenseMap<const MachineInstr *, SmallVector<unsigned>> 590 InlineAsmMemoryOpIndexesMap; 591 for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) { 592 switch (UseMI.getOpcode()) { 593 default: 594 LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI); 595 return false; 596 case LoongArch::VLDREPL_B: 597 case LoongArch::XVLDREPL_B: 598 // We can't do this for large pattern. 599 if (Last) 600 return false; 601 [[fallthrough]]; 602 case LoongArch::LD_B: 603 case LoongArch::LD_H: 604 case LoongArch::LD_W: 605 case LoongArch::LD_D: 606 case LoongArch::LD_BU: 607 case LoongArch::LD_HU: 608 case LoongArch::LD_WU: 609 case LoongArch::LDPTR_W: 610 case LoongArch::LDPTR_D: 611 case LoongArch::FLD_S: 612 case LoongArch::FLD_D: 613 case LoongArch::VLD: 614 case LoongArch::XVLD: 615 case LoongArch::ST_B: 616 case LoongArch::ST_H: 617 case LoongArch::ST_W: 618 case LoongArch::ST_D: 619 case LoongArch::STPTR_W: 620 case LoongArch::STPTR_D: 621 case LoongArch::FST_S: 622 case LoongArch::FST_D: 623 case LoongArch::VST: 624 case LoongArch::XVST: { 625 if (UseMI.getOperand(1).isFI()) 626 return false; 627 // Register defined by Lo should not be the value register. 628 if (DestReg == UseMI.getOperand(0).getReg()) 629 return false; 630 assert(DestReg == UseMI.getOperand(1).getReg() && 631 "Expected base address use"); 632 // All load/store instructions must use the same offset. 633 int64_t Offset = UseMI.getOperand(2).getImm(); 634 if (CommonOffset && Offset != CommonOffset) 635 return false; 636 CommonOffset = Offset; 637 break; 638 } 639 case LoongArch::INLINEASM: 640 case LoongArch::INLINEASM_BR: { 641 // We can't do this for large pattern. 642 if (Last) 643 return false; 644 SmallVector<unsigned> InlineAsmMemoryOpIndexes; 645 unsigned NumOps = 0; 646 for (unsigned I = InlineAsm::MIOp_FirstOperand; 647 I < UseMI.getNumOperands(); I += 1 + NumOps) { 648 const MachineOperand &FlagsMO = UseMI.getOperand(I); 649 // Should be an imm. 650 if (!FlagsMO.isImm()) 651 continue; 652 653 const InlineAsm::Flag Flags(FlagsMO.getImm()); 654 NumOps = Flags.getNumOperandRegisters(); 655 656 // Memory constraints have two operands. 657 if (NumOps != 2 || !Flags.isMemKind()) { 658 // If the register is used by something other than a memory contraint, 659 // we should not fold. 660 for (unsigned J = 0; J < NumOps; ++J) { 661 const MachineOperand &MO = UseMI.getOperand(I + 1 + J); 662 if (MO.isReg() && MO.getReg() == DestReg) 663 return false; 664 } 665 continue; 666 } 667 668 // We can only do this for constraint m. 669 if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m) 670 return false; 671 672 const MachineOperand &AddrMO = UseMI.getOperand(I + 1); 673 if (!AddrMO.isReg() || AddrMO.getReg() != DestReg) 674 continue; 675 676 const MachineOperand &OffsetMO = UseMI.getOperand(I + 2); 677 if (!OffsetMO.isImm()) 678 continue; 679 680 // All inline asm memory operands must use the same offset. 681 int64_t Offset = OffsetMO.getImm(); 682 if (CommonOffset && Offset != CommonOffset) 683 return false; 684 CommonOffset = Offset; 685 InlineAsmMemoryOpIndexes.push_back(I + 1); 686 } 687 InlineAsmMemoryOpIndexesMap.insert( 688 std::make_pair(&UseMI, InlineAsmMemoryOpIndexes)); 689 break; 690 } 691 } 692 } 693 694 // We found a common offset. 695 // Update the offsets in global address lowering. 696 // We may have already folded some arithmetic so we need to add to any 697 // existing offset. 698 int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset; 699 // LA32 ignores the upper 32 bits. 700 if (!ST->is64Bit()) 701 NewOffset = SignExtend64<32>(NewOffset); 702 // We can only fold simm32 offsets. 703 if (!isInt<32>(NewOffset)) 704 return false; 705 706 // If optimized by this pass successfully, MO_RELAX bitmask target-flag should 707 // be removed from the pcala code sequence. Code sequence of tls-le can still 708 // be relaxed after being optimized. 709 // 710 // For example: 711 // pcalau12i $a0, %pc_hi20(symbol) 712 // addi.d $a0, $a0, %pc_lo12(symbol) 713 // ld.w $a0, $a0, 0 714 // 715 // => 716 // 717 // pcalau12i $a0, %pc_hi20(symbol) 718 // ld.w $a0, $a0, %pc_lo12(symbol) 719 // 720 // Code sequence optimized before can be relax by linker. But after being 721 // optimized, it cannot be relaxed any more. So MO_RELAX flag should not be 722 // carried by them. 723 Hi20.getOperand(1).setOffset(NewOffset); 724 MachineOperand &ImmOp = Lo12.getOperand(2); 725 ImmOp.setOffset(NewOffset); 726 if (Lo20 && Hi12) { 727 Lo20->getOperand(2).setOffset(NewOffset); 728 Hi12->getOperand(2).setOffset(NewOffset); 729 } 730 if (Hi20.getOpcode() == LoongArch::PCALAU12I) { 731 Hi20.getOperand(1).setTargetFlags( 732 LoongArchII::getDirectFlags(Hi20.getOperand(1))); 733 ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp)); 734 } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { 735 MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); 736 Add->getOperand(3).setOffset(NewOffset); 737 } 738 739 // Update the immediate in the load/store instructions to add the offset. 740 const LoongArchInstrInfo &TII = *ST->getInstrInfo(); 741 for (MachineInstr &UseMI : 742 llvm::make_early_inc_range(MRI->use_instructions(DestReg))) { 743 if (UseMI.getOpcode() == LoongArch::INLINEASM || 744 UseMI.getOpcode() == LoongArch::INLINEASM_BR) { 745 auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI]; 746 for (unsigned I : InlineAsmMemoryOpIndexes) { 747 MachineOperand &MO = UseMI.getOperand(I + 1); 748 switch (ImmOp.getType()) { 749 case MachineOperand::MO_GlobalAddress: 750 MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(), 751 LoongArchII::getDirectFlags(ImmOp)); 752 break; 753 case MachineOperand::MO_MCSymbol: 754 MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), 755 LoongArchII::getDirectFlags(ImmOp)); 756 MO.setOffset(ImmOp.getOffset()); 757 break; 758 case MachineOperand::MO_BlockAddress: 759 MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(), 760 LoongArchII::getDirectFlags(ImmOp)); 761 break; 762 case MachineOperand::MO_ConstantPoolIndex: 763 MO.ChangeToCPI(ImmOp.getIndex(), ImmOp.getOffset(), 764 LoongArchII::getDirectFlags(ImmOp)); 765 break; 766 default: 767 report_fatal_error("unsupported machine operand type"); 768 break; 769 } 770 } 771 } else { 772 UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last))); 773 if (Last) { 774 UseMI.removeOperand(2); 775 UseMI.removeOperand(1); 776 UseMI.addOperand(Last->getOperand(1)); 777 UseMI.addOperand(Last->getOperand(2)); 778 UseMI.getOperand(1).setIsKill(false); 779 UseMI.getOperand(2).setIsKill(false); 780 } else { 781 UseMI.removeOperand(2); 782 UseMI.addOperand(ImmOp); 783 } 784 } 785 } 786 787 if (Last) { 788 Last->eraseFromParent(); 789 return true; 790 } 791 792 if (Hi20.getOpcode() == LoongArch::PCALAU12I) { 793 MRI->replaceRegWith(Lo12.getOperand(0).getReg(), 794 Hi20.getOperand(0).getReg()); 795 } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { 796 MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg()); 797 MRI->replaceRegWith(Lo12.getOperand(0).getReg(), 798 Add->getOperand(0).getReg()); 799 } 800 Lo12.eraseFromParent(); 801 return true; 802 } 803 804 bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { 805 if (skipFunction(Fn.getFunction())) 806 return false; 807 808 ST = &Fn.getSubtarget<LoongArchSubtarget>(); 809 810 bool MadeChange = false; 811 MRI = &Fn.getRegInfo(); 812 for (MachineBasicBlock &MBB : Fn) { 813 LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); 814 for (MachineInstr &Hi20 : MBB) { 815 MachineInstr *Lo12 = nullptr; 816 MachineInstr *Lo20 = nullptr; 817 MachineInstr *Hi12 = nullptr; 818 MachineInstr *Last = nullptr; 819 if (Hi20.getOpcode() == LoongArch::PCALAU12I) { 820 // Detect foldable pcala code sequence in small/medium/large code model. 821 if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last)) 822 continue; 823 } else if (Hi20.getOpcode() == LoongArch::LU12I_W) { 824 MachineInstr *Add = nullptr; 825 // Detect foldable tls-le code sequence in small/medium code model. 826 if (!detectFoldable(Hi20, Add, Lo12)) 827 continue; 828 } else { 829 continue; 830 } 831 // For tls-le, we do not pass the second PseudoAddTPRel instr in order to 832 // reuse the existing hooks and the last three paramaters should always be 833 // nullptr. 834 MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last); 835 MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last); 836 } 837 } 838 839 return MadeChange; 840 } 841 842 /// Returns an instance of the Merge Base Offset Optimization pass. 843 FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() { 844 return new LoongArchMergeBaseOffsetOpt(); 845 } 846