xref: /freebsd/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp (revision e64bea71c21eb42e97aa615188ba91f6cce0d36d)
1 //===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Merge the offset of address calculation into the offset field
10 // of instructions in a global address lowering sequence.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "LoongArch.h"
15 #include "LoongArchTargetMachine.h"
16 #include "llvm/CodeGen/MachineFunctionPass.h"
17 #include "llvm/CodeGen/Passes.h"
18 #include "llvm/MC/TargetRegistry.h"
19 #include "llvm/Support/Debug.h"
20 #include "llvm/Target/TargetOptions.h"
21 #include <optional>
22 
23 using namespace llvm;
24 
25 #define DEBUG_TYPE "loongarch-merge-base-offset"
26 #define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"
27 
28 namespace {
29 
30 class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
31   const LoongArchSubtarget *ST = nullptr;
32   MachineRegisterInfo *MRI;
33 
34 public:
35   static char ID;
36   bool runOnMachineFunction(MachineFunction &Fn) override;
37   bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
38                       MachineInstr *&Lo20, MachineInstr *&Hi12,
39                       MachineInstr *&Last);
40   bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
41                       MachineInstr *&Lo12);
42 
43   bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
44                            MachineInstr *&Lo20, MachineInstr *&Hi12,
45                            MachineInstr *&Last);
46   void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
47                   MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
48                   int64_t Offset);
49   bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,
50                        MachineInstr *&Lo20, MachineInstr *&Hi12,
51                        MachineInstr *&Last, MachineInstr &TailAdd,
52                        Register GAReg);
53 
54   bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,
55                          MachineInstr *&Lo20, MachineInstr *&Hi12,
56                          MachineInstr *&Last);
57 
LoongArchMergeBaseOffsetOpt()58   LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
59 
getRequiredProperties() const60   MachineFunctionProperties getRequiredProperties() const override {
61     return MachineFunctionProperties().setIsSSA();
62   }
63 
getAnalysisUsage(AnalysisUsage & AU) const64   void getAnalysisUsage(AnalysisUsage &AU) const override {
65     AU.setPreservesCFG();
66     MachineFunctionPass::getAnalysisUsage(AU);
67   }
68 
getPassName() const69   StringRef getPassName() const override {
70     return LoongArch_MERGE_BASE_OFFSET_NAME;
71   }
72 };
73 } // end anonymous namespace
74 
75 char LoongArchMergeBaseOffsetOpt::ID = 0;
INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt,DEBUG_TYPE,LoongArch_MERGE_BASE_OFFSET_NAME,false,false)76 INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,
77                 LoongArch_MERGE_BASE_OFFSET_NAME, false, false)
78 
79 // Detect either of the patterns:
80 //
81 // 1. (small/medium):
82 //   pcalau12i vreg1, %pc_hi20(s)
83 //   addi.d    vreg2, vreg1, %pc_lo12(s)
84 //
85 // 2. (large):
86 //   pcalau12i vreg1, %pc_hi20(s)
87 //   addi.d    vreg2, $zero, %pc_lo12(s)
88 //   lu32i.d   vreg3, vreg2, %pc64_lo20(s)
89 //   lu52i.d   vreg4, vreg3, %pc64_hi12(s)
90 //   add.d     vreg5, vreg4, vreg1
91 
92 // The pattern is only accepted if:
93 //    1) For small and medium pattern, the first instruction has only one use,
94 //       which is the ADDI.
95 //    2) For large pattern, the first four instructions each have only one use,
96 //       and the user of the fourth instruction is ADD.
97 //    3) The address operands have the appropriate type, reflecting the
98 //       lowering of a global address or constant pool using the pattern.
99 //    4) The offset value in the Global Address or Constant Pool is 0.
100 bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
101                                                  MachineInstr *&Lo12,
102                                                  MachineInstr *&Lo20,
103                                                  MachineInstr *&Hi12,
104                                                  MachineInstr *&Last) {
105   if (Hi20.getOpcode() != LoongArch::PCALAU12I)
106     return false;
107 
108   const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
109   if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)
110     return false;
111 
112   auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {
113     return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();
114   };
115 
116   if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)
117     return false;
118 
119   Register HiDestReg = Hi20.getOperand(0).getReg();
120   if (!MRI->hasOneUse(HiDestReg))
121     return false;
122 
123   MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);
124   if (UseInst->getOpcode() != LoongArch::ADD_D) {
125     Lo12 = UseInst;
126     if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
127         (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
128       return false;
129   } else {
130     assert(ST->is64Bit());
131     Last = UseInst;
132 
133     Register LastOp1Reg = Last->getOperand(1).getReg();
134     if (!LastOp1Reg.isVirtual())
135       return false;
136     Hi12 = MRI->getVRegDef(LastOp1Reg);
137     const MachineOperand &Hi12Op2 = Hi12->getOperand(2);
138     if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)
139       return false;
140     if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)
141       return false;
142     if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))
143       return false;
144 
145     Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());
146     const MachineOperand &Lo20Op2 = Lo20->getOperand(2);
147     if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)
148       return false;
149     if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)
150       return false;
151     if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))
152       return false;
153 
154     Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());
155     if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))
156       return false;
157   }
158 
159   const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
160   assert(Hi20.getOpcode() == LoongArch::PCALAU12I);
161   if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||
162       !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
163       Lo12Op2.getOffset() != 0)
164     return false;
165 
166   if (Hi20Op1.isGlobal()) {
167     LLVM_DEBUG(dbgs() << "  Found lowered global address: "
168                       << *Hi20Op1.getGlobal() << "\n");
169   } else if (Hi20Op1.isBlockAddress()) {
170     LLVM_DEBUG(dbgs() << "  Found lowered basic address: "
171                       << *Hi20Op1.getBlockAddress() << "\n");
172   } else if (Hi20Op1.isCPI()) {
173     LLVM_DEBUG(dbgs() << "  Found lowered constant pool: " << Hi20Op1.getIndex()
174                       << "\n");
175   }
176 
177   return true;
178 }
179 
180 // Detect the pattern:
181 //
182 // (small/medium):
183 //   lu12i.w  vreg1, %le_hi20_r(s)
184 //   add.w/d  vreg2, vreg1, r2, %le_add_r(s)
185 //   addi.w/d vreg3, vreg2, %le_lo12_r(s)
186 
187 // The pattern is only accepted if:
188 //    1) The first instruction has only one use, which is the PseudoAddTPRel.
189 //       The second instruction has only one use, which is the ADDI. The
190 //       second instruction's last operand is the tp register.
191 //    2) The address operands have the appropriate type, reflecting the
192 //       lowering of a thread_local global address using the pattern.
193 //    3) The offset value in the ThreadLocal Global Address is 0.
detectFoldable(MachineInstr & Hi20,MachineInstr * & Add,MachineInstr * & Lo12)194 bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
195                                                  MachineInstr *&Add,
196                                                  MachineInstr *&Lo12) {
197   if (Hi20.getOpcode() != LoongArch::LU12I_W)
198     return false;
199 
200   auto isGlobalOrCPI = [](const MachineOperand &Op) {
201     return Op.isGlobal() || Op.isCPI();
202   };
203 
204   const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
205   if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
206       !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
207     return false;
208 
209   Register HiDestReg = Hi20.getOperand(0).getReg();
210   if (!MRI->hasOneUse(HiDestReg))
211     return false;
212 
213   Add = &*MRI->use_instr_begin(HiDestReg);
214   if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
215       (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
216     return false;
217 
218   if (Add->getOperand(2).getReg() != LoongArch::R2)
219     return false;
220 
221   const MachineOperand &AddOp3 = Add->getOperand(3);
222   if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
223       !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
224       AddOp3.getOffset() != 0)
225     return false;
226 
227   Register AddDestReg = Add->getOperand(0).getReg();
228   if (!MRI->hasOneUse(AddDestReg))
229     return false;
230 
231   Lo12 = &*MRI->use_instr_begin(AddDestReg);
232   if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
233       (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
234     return false;
235 
236   const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
237   if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
238       !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
239       Lo12Op2.getOffset() != 0)
240     return false;
241 
242   if (Hi20Op1.isGlobal()) {
243     LLVM_DEBUG(dbgs() << "  Found lowered global address: "
244                       << *Hi20Op1.getGlobal() << "\n");
245   } else if (Hi20Op1.isCPI()) {
246     LLVM_DEBUG(dbgs() << "  Found lowered constant pool: " << Hi20Op1.getIndex()
247                       << "\n");
248   }
249 
250   return true;
251 }
252 
253 // Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
254 // Delete the tail instruction and update all the uses to use the
255 // output from Last.
foldOffset(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last,MachineInstr & Tail,int64_t Offset)256 void LoongArchMergeBaseOffsetOpt::foldOffset(
257     MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
258     MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
259     int64_t Offset) {
260   // Put the offset back in Hi and the Lo
261   Hi20.getOperand(1).setOffset(Offset);
262   Lo12.getOperand(2).setOffset(Offset);
263   if (Lo20 && Hi12) {
264     Lo20->getOperand(2).setOffset(Offset);
265     Hi12->getOperand(2).setOffset(Offset);
266   }
267 
268   // For tls-le, offset of the second PseudoAddTPRel instr should also be
269   // updated.
270   MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
271   if (Hi20.getOpcode() == LoongArch::LU12I_W)
272     Add->getOperand(3).setOffset(Offset);
273 
274   // Delete the tail instruction.
275   MachineInstr *Def = Last ? Last : &Lo12;
276   MRI->constrainRegClass(Def->getOperand(0).getReg(),
277                          MRI->getRegClass(Tail.getOperand(0).getReg()));
278   MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
279   Tail.eraseFromParent();
280 
281   LLVM_DEBUG(dbgs() << "  Merged offset " << Offset << " into base.\n"
282                     << "     " << Hi20;);
283   if (Hi20.getOpcode() == LoongArch::LU12I_W) {
284     LLVM_DEBUG(dbgs() << "     " << *Add;);
285   }
286   LLVM_DEBUG(dbgs() << "     " << Lo12;);
287   if (Lo20 && Hi12) {
288     LLVM_DEBUG(dbgs() << "     " << *Lo20 << "     " << *Hi12;);
289   }
290 }
291 
292 // Detect patterns for large offsets that are passed into an ADD instruction.
293 // If the pattern is found, updates the offset in Hi20, (Add), Lo12,
294 // (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
295 // produced the offset.
296 //
297 //   (The instructions marked with "!" are not necessarily present)
298 //
299 //        Base address lowering is of the form:
300 //           1) pcala:
301 //             Hi20:  pcalau12i vreg1, %pc_hi20(s)
302 //        +--- Lo12:  addi.d vreg2, vreg1, %pc_lo12(s)
303 //        |    Lo20:  lu32i.d vreg2, %pc64_lo20(s) !
304 //        +--- Hi12:  lu52i.d vreg2, vreg2, %pc64_hi12(s) !
305 //        |
306 //        |  2) tls-le:
307 //        |    Hi20:  lu12i.w vreg1, %le_hi20_r(s)
308 //        |    Add:   add.w/d vreg1, vreg1, r2, %le_add_r(s)
309 //        +--- Lo12:  addi.w/d vreg2, vreg1, %le_lo12_r(s)
310 //        |
311 //        | The large offset can be one of the forms:
312 //        |
313 //        +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits:
314 //        |     OffsetHi20: lu12i.w vreg3, 4
315 //        |     OffsetLo12: ori voff, vreg3, 188    ------------------+
316 //        |                                                           |
317 //        +-> 2) Offset that has non zero bits in Hi20 bits only:     |
318 //        |     OffsetHi20: lu12i.w voff, 128       ------------------+
319 //        |                                                           |
320 //        +-> 3) Offset that has non zero bits in Lo20 bits:          |
321 //        |     OffsetHi20: lu12i.w vreg3, 121 !                      |
322 //        |     OffsetLo12: ori voff, vreg3, 122 !                    |
323 //        |     OffsetLo20: lu32i.d voff, 123       ------------------+
324 //        +-> 4) Offset that has non zero bits in Hi12 bits:          |
325 //              OffsetHi20: lu12i.w vreg3, 121 !                      |
326 //              OffsetLo12: ori voff, vreg3, 122 !                    |
327 //              OffsetLo20: lu32i.d vreg3, 123 !                      |
328 //              OffsetHi12: lu52i.d voff, vrg3, 124 ------------------+
329 //                                                                    |
330 //        TailAdd: add.d  vreg4, vreg2, voff       <------------------+
331 //
foldLargeOffset(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last,MachineInstr & TailAdd,Register GAReg)332 bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
333     MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
334     MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
335     Register GAReg) {
336   assert((TailAdd.getOpcode() == LoongArch::ADD_W ||
337           TailAdd.getOpcode() == LoongArch::ADD_D) &&
338          "Expected ADD instruction!");
339   Register Rs = TailAdd.getOperand(1).getReg();
340   Register Rt = TailAdd.getOperand(2).getReg();
341   Register Reg = Rs == GAReg ? Rt : Rs;
342   SmallVector<MachineInstr *, 4> Instrs;
343   int64_t Offset = 0;
344   int64_t Mask = -1;
345 
346   // This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:
347   for (int i = 0; i < 4; i++) {
348     // Handle Reg is R0.
349     if (Reg == LoongArch::R0)
350       break;
351 
352     // Can't fold if the register has more than one use.
353     if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
354       return false;
355 
356     MachineInstr *Curr = MRI->getVRegDef(Reg);
357     if (!Curr)
358       break;
359 
360     switch (Curr->getOpcode()) {
361     default:
362       // Can't fold if the instruction opcode is unexpected.
363       return false;
364     case LoongArch::ORI: {
365       MachineOperand ImmOp = Curr->getOperand(2);
366       if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
367         return false;
368       Offset += ImmOp.getImm();
369       Reg = Curr->getOperand(1).getReg();
370       Instrs.push_back(Curr);
371       break;
372     }
373     case LoongArch::LU12I_W: {
374       MachineOperand ImmOp = Curr->getOperand(1);
375       if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
376         return false;
377       Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
378       Reg = LoongArch::R0;
379       Instrs.push_back(Curr);
380       break;
381     }
382     case LoongArch::LU32I_D: {
383       MachineOperand ImmOp = Curr->getOperand(2);
384       if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
385         return false;
386       Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
387       Mask ^= 0x000FFFFF00000000ULL;
388       Reg = Curr->getOperand(1).getReg();
389       Instrs.push_back(Curr);
390       break;
391     }
392     case LoongArch::LU52I_D: {
393       MachineOperand ImmOp = Curr->getOperand(2);
394       if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
395         return false;
396       Offset += ImmOp.getImm() << 52;
397       Mask ^= 0xFFF0000000000000ULL;
398       Reg = Curr->getOperand(1).getReg();
399       Instrs.push_back(Curr);
400       break;
401     }
402     }
403   }
404 
405   // Can't fold if the offset is not extracted.
406   if (!Offset)
407     return false;
408 
409   foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
410   LLVM_DEBUG(dbgs() << "  Offset Instrs:\n");
411   for (auto I : Instrs) {
412     LLVM_DEBUG(dbgs() << "                 " << *I);
413     I->eraseFromParent();
414   }
415 
416   return true;
417 }
418 
detectAndFoldOffset(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last)419 bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
420                                                       MachineInstr &Lo12,
421                                                       MachineInstr *&Lo20,
422                                                       MachineInstr *&Hi12,
423                                                       MachineInstr *&Last) {
424   Register DestReg =
425       Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();
426 
427   // Look for arithmetic instructions we can get an offset from.
428   // We might be able to remove the arithmetic instructions by folding the
429   // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
430   // LU12I_W+PseudoAddTPRel+ADDI.
431   if (!MRI->hasOneUse(DestReg))
432     return false;
433 
434   // DestReg has only one use.
435   MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
436   switch (Tail.getOpcode()) {
437   default:
438     LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
439                       << Tail);
440     break;
441   case LoongArch::ADDI_W:
442     if (ST->is64Bit())
443       return false;
444     [[fallthrough]];
445   case LoongArch::ADDI_D:
446   case LoongArch::ADDU16I_D: {
447     // Offset is simply an immediate operand.
448     int64_t Offset = Tail.getOperand(2).getImm();
449     if (Tail.getOpcode() == LoongArch::ADDU16I_D)
450       Offset = SignExtend64<32>(Offset << 16);
451 
452     // We might have two ADDIs in a row.
453     Register TailDestReg = Tail.getOperand(0).getReg();
454     if (MRI->hasOneUse(TailDestReg)) {
455       MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
456       if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)
457         return false;
458       if (TailTail.getOpcode() == LoongArch::ADDI_W ||
459           TailTail.getOpcode() == LoongArch::ADDI_D) {
460         Offset += TailTail.getOperand(2).getImm();
461         LLVM_DEBUG(dbgs() << "  Offset Instrs: " << Tail << TailTail);
462         foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);
463         Tail.eraseFromParent();
464         return true;
465       }
466     }
467 
468     LLVM_DEBUG(dbgs() << "  Offset Instr: " << Tail);
469     foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);
470     return true;
471   }
472   case LoongArch::ADD_W:
473     if (ST->is64Bit())
474       return false;
475     [[fallthrough]];
476   case LoongArch::ADD_D:
477     // The offset is too large to fit in the immediate field of ADDI.
478     return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
479     break;
480   }
481 
482   return false;
483 }
484 
485 // Memory access opcode mapping for transforms.
getNewOpc(unsigned Op,bool isLarge)486 static unsigned getNewOpc(unsigned Op, bool isLarge) {
487   switch (Op) {
488   case LoongArch::LD_B:
489     return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;
490   case LoongArch::LD_H:
491     return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;
492   case LoongArch::LD_W:
493   case LoongArch::LDPTR_W:
494     return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;
495   case LoongArch::LD_D:
496   case LoongArch::LDPTR_D:
497     return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;
498   case LoongArch::LD_BU:
499     return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;
500   case LoongArch::LD_HU:
501     return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;
502   case LoongArch::LD_WU:
503     return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;
504   case LoongArch::FLD_S:
505     return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;
506   case LoongArch::FLD_D:
507     return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;
508   case LoongArch::VLD:
509     return isLarge ? LoongArch::VLDX : LoongArch::VLD;
510   case LoongArch::XVLD:
511     return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;
512   case LoongArch::VLDREPL_B:
513     return LoongArch::VLDREPL_B;
514   case LoongArch::XVLDREPL_B:
515     return LoongArch::XVLDREPL_B;
516   case LoongArch::ST_B:
517     return isLarge ? LoongArch::STX_B : LoongArch::ST_B;
518   case LoongArch::ST_H:
519     return isLarge ? LoongArch::STX_H : LoongArch::ST_H;
520   case LoongArch::ST_W:
521   case LoongArch::STPTR_W:
522     return isLarge ? LoongArch::STX_W : LoongArch::ST_W;
523   case LoongArch::ST_D:
524   case LoongArch::STPTR_D:
525     return isLarge ? LoongArch::STX_D : LoongArch::ST_D;
526   case LoongArch::FST_S:
527     return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;
528   case LoongArch::FST_D:
529     return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;
530   case LoongArch::VST:
531     return isLarge ? LoongArch::VSTX : LoongArch::VST;
532   case LoongArch::XVST:
533     return isLarge ? LoongArch::XVSTX : LoongArch::XVST;
534   default:
535     llvm_unreachable("Unexpected opcode for replacement");
536   }
537 }
538 
foldIntoMemoryOps(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last)539 bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
540                                                     MachineInstr &Lo12,
541                                                     MachineInstr *&Lo20,
542                                                     MachineInstr *&Hi12,
543                                                     MachineInstr *&Last) {
544   Register DestReg =
545       Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();
546 
547   // If all the uses are memory ops with the same offset, we can transform:
548   //
549   // 1. (small/medium):
550   //  1.1. pcala
551   //   pcalau12i vreg1, %pc_hi20(s)
552   //   addi.d    vreg2, vreg1, %pc_lo12(s)
553   //   ld.w      vreg3, 8(vreg2)
554   //
555   //   =>
556   //
557   //   pcalau12i vreg1, %pc_hi20(s+8)
558   //   ld.w      vreg3, vreg1, %pc_lo12(s+8)(vreg1)
559   //
560   //  1.2. tls-le
561   //   lu12i.w  vreg1, %le_hi20_r(s)
562   //   add.w/d  vreg2, vreg1, r2, %le_add_r(s)
563   //   addi.w/d vreg3, vreg2, %le_lo12_r(s)
564   //   ld.w     vreg4, 8(vreg3)
565   //
566   //   =>
567   //
568   //   lu12i.w vreg1, %le_hi20_r(s+8)
569   //   add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
570   //   ld.w    vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
571   //
572   // 2. (large):
573   //   pcalau12i vreg1, %pc_hi20(s)
574   //   addi.d    vreg2, $zero, %pc_lo12(s)
575   //   lu32i.d   vreg3, vreg2, %pc64_lo20(s)
576   //   lu52i.d   vreg4, vreg3, %pc64_hi12(s)
577   //   add.d     vreg5, vreg4, vreg1
578   //   ld.w      vreg6, 8(vreg5)
579   //
580   //   =>
581   //
582   //   pcalau12i vreg1, %pc_hi20(s+8)
583   //   addi.d    vreg2, $zero, %pc_lo12(s+8)
584   //   lu32i.d   vreg3, vreg2, %pc64_lo20(s+8)
585   //   lu52i.d   vreg4, vreg3, %pc64_hi12(s+8)
586   //   ldx.w     vreg6, vreg4, vreg1
587 
588   std::optional<int64_t> CommonOffset;
589   DenseMap<const MachineInstr *, SmallVector<unsigned>>
590       InlineAsmMemoryOpIndexesMap;
591   for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
592     switch (UseMI.getOpcode()) {
593     default:
594       LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
595       return false;
596     case LoongArch::VLDREPL_B:
597     case LoongArch::XVLDREPL_B:
598       // We can't do this for large pattern.
599       if (Last)
600         return false;
601       [[fallthrough]];
602     case LoongArch::LD_B:
603     case LoongArch::LD_H:
604     case LoongArch::LD_W:
605     case LoongArch::LD_D:
606     case LoongArch::LD_BU:
607     case LoongArch::LD_HU:
608     case LoongArch::LD_WU:
609     case LoongArch::LDPTR_W:
610     case LoongArch::LDPTR_D:
611     case LoongArch::FLD_S:
612     case LoongArch::FLD_D:
613     case LoongArch::VLD:
614     case LoongArch::XVLD:
615     case LoongArch::ST_B:
616     case LoongArch::ST_H:
617     case LoongArch::ST_W:
618     case LoongArch::ST_D:
619     case LoongArch::STPTR_W:
620     case LoongArch::STPTR_D:
621     case LoongArch::FST_S:
622     case LoongArch::FST_D:
623     case LoongArch::VST:
624     case LoongArch::XVST: {
625       if (UseMI.getOperand(1).isFI())
626         return false;
627       // Register defined by Lo should not be the value register.
628       if (DestReg == UseMI.getOperand(0).getReg())
629         return false;
630       assert(DestReg == UseMI.getOperand(1).getReg() &&
631              "Expected base address use");
632       // All load/store instructions must use the same offset.
633       int64_t Offset = UseMI.getOperand(2).getImm();
634       if (CommonOffset && Offset != CommonOffset)
635         return false;
636       CommonOffset = Offset;
637       break;
638     }
639     case LoongArch::INLINEASM:
640     case LoongArch::INLINEASM_BR: {
641       // We can't do this for large pattern.
642       if (Last)
643         return false;
644       SmallVector<unsigned> InlineAsmMemoryOpIndexes;
645       unsigned NumOps = 0;
646       for (unsigned I = InlineAsm::MIOp_FirstOperand;
647            I < UseMI.getNumOperands(); I += 1 + NumOps) {
648         const MachineOperand &FlagsMO = UseMI.getOperand(I);
649         // Should be an imm.
650         if (!FlagsMO.isImm())
651           continue;
652 
653         const InlineAsm::Flag Flags(FlagsMO.getImm());
654         NumOps = Flags.getNumOperandRegisters();
655 
656         // Memory constraints have two operands.
657         if (NumOps != 2 || !Flags.isMemKind()) {
658           // If the register is used by something other than a memory contraint,
659           // we should not fold.
660           for (unsigned J = 0; J < NumOps; ++J) {
661             const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
662             if (MO.isReg() && MO.getReg() == DestReg)
663               return false;
664           }
665           continue;
666         }
667 
668         // We can only do this for constraint m.
669         if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)
670           return false;
671 
672         const MachineOperand &AddrMO = UseMI.getOperand(I + 1);
673         if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)
674           continue;
675 
676         const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);
677         if (!OffsetMO.isImm())
678           continue;
679 
680         // All inline asm memory operands must use the same offset.
681         int64_t Offset = OffsetMO.getImm();
682         if (CommonOffset && Offset != CommonOffset)
683           return false;
684         CommonOffset = Offset;
685         InlineAsmMemoryOpIndexes.push_back(I + 1);
686       }
687       InlineAsmMemoryOpIndexesMap.insert(
688           std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));
689       break;
690     }
691     }
692   }
693 
694   // We found a common offset.
695   // Update the offsets in global address lowering.
696   // We may have already folded some arithmetic so we need to add to any
697   // existing offset.
698   int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;
699   // LA32 ignores the upper 32 bits.
700   if (!ST->is64Bit())
701     NewOffset = SignExtend64<32>(NewOffset);
702   // We can only fold simm32 offsets.
703   if (!isInt<32>(NewOffset))
704     return false;
705 
706   // If optimized by this pass successfully, MO_RELAX bitmask target-flag should
707   // be removed from the pcala code sequence. Code sequence of tls-le can still
708   // be relaxed after being optimized.
709   //
710   // For example:
711   //   pcalau12i $a0, %pc_hi20(symbol)
712   //   addi.d $a0, $a0, %pc_lo12(symbol)
713   //   ld.w $a0, $a0, 0
714   //
715   //   =>
716   //
717   //   pcalau12i $a0, %pc_hi20(symbol)
718   //   ld.w $a0, $a0, %pc_lo12(symbol)
719   //
720   // Code sequence optimized before can be relax by linker. But after being
721   // optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
722   // carried by them.
723   Hi20.getOperand(1).setOffset(NewOffset);
724   MachineOperand &ImmOp = Lo12.getOperand(2);
725   ImmOp.setOffset(NewOffset);
726   if (Lo20 && Hi12) {
727     Lo20->getOperand(2).setOffset(NewOffset);
728     Hi12->getOperand(2).setOffset(NewOffset);
729   }
730   if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
731     Hi20.getOperand(1).setTargetFlags(
732         LoongArchII::getDirectFlags(Hi20.getOperand(1)));
733     ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
734   } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
735     MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
736     Add->getOperand(3).setOffset(NewOffset);
737   }
738 
739   // Update the immediate in the load/store instructions to add the offset.
740   const LoongArchInstrInfo &TII = *ST->getInstrInfo();
741   for (MachineInstr &UseMI :
742        llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
743     if (UseMI.getOpcode() == LoongArch::INLINEASM ||
744         UseMI.getOpcode() == LoongArch::INLINEASM_BR) {
745       auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];
746       for (unsigned I : InlineAsmMemoryOpIndexes) {
747         MachineOperand &MO = UseMI.getOperand(I + 1);
748         switch (ImmOp.getType()) {
749         case MachineOperand::MO_GlobalAddress:
750           MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),
751                         LoongArchII::getDirectFlags(ImmOp));
752           break;
753         case MachineOperand::MO_MCSymbol:
754           MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),
755                               LoongArchII::getDirectFlags(ImmOp));
756           MO.setOffset(ImmOp.getOffset());
757           break;
758         case MachineOperand::MO_BlockAddress:
759           MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),
760                         LoongArchII::getDirectFlags(ImmOp));
761           break;
762         case MachineOperand::MO_ConstantPoolIndex:
763           MO.ChangeToCPI(ImmOp.getIndex(), ImmOp.getOffset(),
764                          LoongArchII::getDirectFlags(ImmOp));
765           break;
766         default:
767           report_fatal_error("unsupported machine operand type");
768           break;
769         }
770       }
771     } else {
772       UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));
773       if (Last) {
774         UseMI.removeOperand(2);
775         UseMI.removeOperand(1);
776         UseMI.addOperand(Last->getOperand(1));
777         UseMI.addOperand(Last->getOperand(2));
778         UseMI.getOperand(1).setIsKill(false);
779         UseMI.getOperand(2).setIsKill(false);
780       } else {
781         UseMI.removeOperand(2);
782         UseMI.addOperand(ImmOp);
783       }
784     }
785   }
786 
787   if (Last) {
788     Last->eraseFromParent();
789     return true;
790   }
791 
792   if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
793     MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
794                         Hi20.getOperand(0).getReg());
795   } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
796     MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
797     MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
798                         Add->getOperand(0).getReg());
799   }
800   Lo12.eraseFromParent();
801   return true;
802 }
803 
runOnMachineFunction(MachineFunction & Fn)804 bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
805   if (skipFunction(Fn.getFunction()))
806     return false;
807 
808   ST = &Fn.getSubtarget<LoongArchSubtarget>();
809 
810   bool MadeChange = false;
811   MRI = &Fn.getRegInfo();
812   for (MachineBasicBlock &MBB : Fn) {
813     LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
814     for (MachineInstr &Hi20 : MBB) {
815       MachineInstr *Lo12 = nullptr;
816       MachineInstr *Lo20 = nullptr;
817       MachineInstr *Hi12 = nullptr;
818       MachineInstr *Last = nullptr;
819       if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
820         // Detect foldable pcala code sequence in small/medium/large code model.
821         if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
822           continue;
823       } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
824         MachineInstr *Add = nullptr;
825         // Detect foldable tls-le code sequence in small/medium code model.
826         if (!detectFoldable(Hi20, Add, Lo12))
827           continue;
828       } else {
829         continue;
830       }
831       // For tls-le, we do not pass the second PseudoAddTPRel instr in order to
832       // reuse the existing hooks and the last three paramaters should always be
833       // nullptr.
834       MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
835       MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
836     }
837   }
838 
839   return MadeChange;
840 }
841 
842 /// Returns an instance of the Merge Base Offset Optimization pass.
createLoongArchMergeBaseOffsetOptPass()843 FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {
844   return new LoongArchMergeBaseOffsetOpt();
845 }
846