//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Merge the offset of address calculation into the offset field
// of instructions in a global address lowering sequence.
//
//===----------------------------------------------------------------------===//
13
14 #include "LoongArch.h"
15 #include "LoongArchTargetMachine.h"
16 #include "llvm/CodeGen/MachineFunctionPass.h"
17 #include "llvm/CodeGen/Passes.h"
18 #include "llvm/MC/TargetRegistry.h"
19 #include "llvm/Support/Debug.h"
20 #include "llvm/Target/TargetOptions.h"
21 #include <optional>
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "loongarch-merge-base-offset"
26 #define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset"
27
28 namespace {
29
30 class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
31 const LoongArchSubtarget *ST = nullptr;
32 MachineRegisterInfo *MRI;
33
34 public:
35 static char ID;
36 bool runOnMachineFunction(MachineFunction &Fn) override;
37 bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
38 MachineInstr *&Lo20, MachineInstr *&Hi12,
39 MachineInstr *&Last);
40 bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
41 MachineInstr *&Lo12);
42
43 bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
44 MachineInstr *&Lo20, MachineInstr *&Hi12,
45 MachineInstr *&Last);
46 void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
47 MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
48 int64_t Offset);
49 bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12,
50 MachineInstr *&Lo20, MachineInstr *&Hi12,
51 MachineInstr *&Last, MachineInstr &TailAdd,
52 Register GAReg);
53
54 bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12,
55 MachineInstr *&Lo20, MachineInstr *&Hi12,
56 MachineInstr *&Last);
57
LoongArchMergeBaseOffsetOpt()58 LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {}
59
getRequiredProperties() const60 MachineFunctionProperties getRequiredProperties() const override {
61 return MachineFunctionProperties().setIsSSA();
62 }
63
getAnalysisUsage(AnalysisUsage & AU) const64 void getAnalysisUsage(AnalysisUsage &AU) const override {
65 AU.setPreservesCFG();
66 MachineFunctionPass::getAnalysisUsage(AU);
67 }
68
getPassName() const69 StringRef getPassName() const override {
70 return LoongArch_MERGE_BASE_OFFSET_NAME;
71 }
72 };
73 } // end anonymous namespace
74
75 char LoongArchMergeBaseOffsetOpt::ID = 0;
INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt,DEBUG_TYPE,LoongArch_MERGE_BASE_OFFSET_NAME,false,false)76 INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE,
77 LoongArch_MERGE_BASE_OFFSET_NAME, false, false)
78
79 // Detect either of the patterns:
80 //
81 // 1. (small/medium):
82 // pcalau12i vreg1, %pc_hi20(s)
83 // addi.d vreg2, vreg1, %pc_lo12(s)
84 //
85 // 2. (large):
86 // pcalau12i vreg1, %pc_hi20(s)
87 // addi.d vreg2, $zero, %pc_lo12(s)
88 // lu32i.d vreg3, vreg2, %pc64_lo20(s)
89 // lu52i.d vreg4, vreg3, %pc64_hi12(s)
90 // add.d vreg5, vreg4, vreg1
91
92 // The pattern is only accepted if:
93 // 1) For small and medium pattern, the first instruction has only one use,
94 // which is the ADDI.
95 // 2) For large pattern, the first four instructions each have only one use,
96 // and the user of the fourth instruction is ADD.
97 // 3) The address operands have the appropriate type, reflecting the
98 // lowering of a global address or constant pool using the pattern.
99 // 4) The offset value in the Global Address or Constant Pool is 0.
100 bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
101 MachineInstr *&Lo12,
102 MachineInstr *&Lo20,
103 MachineInstr *&Hi12,
104 MachineInstr *&Last) {
105 if (Hi20.getOpcode() != LoongArch::PCALAU12I)
106 return false;
107
108 const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
109 if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_PCREL_HI)
110 return false;
111
112 auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) {
113 return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress();
114 };
115
116 if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0)
117 return false;
118
119 Register HiDestReg = Hi20.getOperand(0).getReg();
120 if (!MRI->hasOneUse(HiDestReg))
121 return false;
122
123 MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg);
124 if (UseInst->getOpcode() != LoongArch::ADD_D) {
125 Lo12 = UseInst;
126 if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
127 (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
128 return false;
129 } else {
130 assert(ST->is64Bit());
131 Last = UseInst;
132
133 Register LastOp1Reg = Last->getOperand(1).getReg();
134 if (!LastOp1Reg.isVirtual())
135 return false;
136 Hi12 = MRI->getVRegDef(LastOp1Reg);
137 const MachineOperand &Hi12Op2 = Hi12->getOperand(2);
138 if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI)
139 return false;
140 if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0)
141 return false;
142 if (!MRI->hasOneUse(Hi12->getOperand(0).getReg()))
143 return false;
144
145 Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg());
146 const MachineOperand &Lo20Op2 = Lo20->getOperand(2);
147 if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO)
148 return false;
149 if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0)
150 return false;
151 if (!MRI->hasOneUse(Lo20->getOperand(0).getReg()))
152 return false;
153
154 Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg());
155 if (!MRI->hasOneUse(Lo12->getOperand(0).getReg()))
156 return false;
157 }
158
159 const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
160 assert(Hi20.getOpcode() == LoongArch::PCALAU12I);
161 if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_PCREL_LO ||
162 !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
163 Lo12Op2.getOffset() != 0)
164 return false;
165
166 if (Hi20Op1.isGlobal()) {
167 LLVM_DEBUG(dbgs() << " Found lowered global address: "
168 << *Hi20Op1.getGlobal() << "\n");
169 } else if (Hi20Op1.isBlockAddress()) {
170 LLVM_DEBUG(dbgs() << " Found lowered basic address: "
171 << *Hi20Op1.getBlockAddress() << "\n");
172 } else if (Hi20Op1.isCPI()) {
173 LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
174 << "\n");
175 }
176
177 return true;
178 }
179
180 // Detect the pattern:
181 //
182 // (small/medium):
183 // lu12i.w vreg1, %le_hi20_r(s)
184 // add.w/d vreg2, vreg1, r2, %le_add_r(s)
185 // addi.w/d vreg3, vreg2, %le_lo12_r(s)
186
187 // The pattern is only accepted if:
188 // 1) The first instruction has only one use, which is the PseudoAddTPRel.
189 // The second instruction has only one use, which is the ADDI. The
190 // second instruction's last operand is the tp register.
191 // 2) The address operands have the appropriate type, reflecting the
192 // lowering of a thread_local global address using the pattern.
193 // 3) The offset value in the ThreadLocal Global Address is 0.
detectFoldable(MachineInstr & Hi20,MachineInstr * & Add,MachineInstr * & Lo12)194 bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
195 MachineInstr *&Add,
196 MachineInstr *&Lo12) {
197 if (Hi20.getOpcode() != LoongArch::LU12I_W)
198 return false;
199
200 auto isGlobalOrCPI = [](const MachineOperand &Op) {
201 return Op.isGlobal() || Op.isCPI();
202 };
203
204 const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
205 if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
206 !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
207 return false;
208
209 Register HiDestReg = Hi20.getOperand(0).getReg();
210 if (!MRI->hasOneUse(HiDestReg))
211 return false;
212
213 Add = &*MRI->use_instr_begin(HiDestReg);
214 if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
215 (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
216 return false;
217
218 if (Add->getOperand(2).getReg() != LoongArch::R2)
219 return false;
220
221 const MachineOperand &AddOp3 = Add->getOperand(3);
222 if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
223 !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
224 AddOp3.getOffset() != 0)
225 return false;
226
227 Register AddDestReg = Add->getOperand(0).getReg();
228 if (!MRI->hasOneUse(AddDestReg))
229 return false;
230
231 Lo12 = &*MRI->use_instr_begin(AddDestReg);
232 if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
233 (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
234 return false;
235
236 const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
237 if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
238 !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
239 Lo12Op2.getOffset() != 0)
240 return false;
241
242 if (Hi20Op1.isGlobal()) {
243 LLVM_DEBUG(dbgs() << " Found lowered global address: "
244 << *Hi20Op1.getGlobal() << "\n");
245 } else if (Hi20Op1.isCPI()) {
246 LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
247 << "\n");
248 }
249
250 return true;
251 }
252
253 // Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
254 // Delete the tail instruction and update all the uses to use the
255 // output from Last.
foldOffset(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last,MachineInstr & Tail,int64_t Offset)256 void LoongArchMergeBaseOffsetOpt::foldOffset(
257 MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
258 MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail,
259 int64_t Offset) {
260 // Put the offset back in Hi and the Lo
261 Hi20.getOperand(1).setOffset(Offset);
262 Lo12.getOperand(2).setOffset(Offset);
263 if (Lo20 && Hi12) {
264 Lo20->getOperand(2).setOffset(Offset);
265 Hi12->getOperand(2).setOffset(Offset);
266 }
267
268 // For tls-le, offset of the second PseudoAddTPRel instr should also be
269 // updated.
270 MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
271 if (Hi20.getOpcode() == LoongArch::LU12I_W)
272 Add->getOperand(3).setOffset(Offset);
273
274 // Delete the tail instruction.
275 MachineInstr *Def = Last ? Last : &Lo12;
276 MRI->constrainRegClass(Def->getOperand(0).getReg(),
277 MRI->getRegClass(Tail.getOperand(0).getReg()));
278 MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
279 Tail.eraseFromParent();
280
281 LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
282 << " " << Hi20;);
283 if (Hi20.getOpcode() == LoongArch::LU12I_W) {
284 LLVM_DEBUG(dbgs() << " " << *Add;);
285 }
286 LLVM_DEBUG(dbgs() << " " << Lo12;);
287 if (Lo20 && Hi12) {
288 LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);
289 }
290 }
291
292 // Detect patterns for large offsets that are passed into an ADD instruction.
293 // If the pattern is found, updates the offset in Hi20, (Add), Lo12,
294 // (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
295 // produced the offset.
296 //
297 // (The instructions marked with "!" are not necessarily present)
298 //
299 // Base address lowering is of the form:
300 // 1) pcala:
301 // Hi20: pcalau12i vreg1, %pc_hi20(s)
302 // +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
303 // | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
304 // +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
305 // |
306 // | 2) tls-le:
307 // | Hi20: lu12i.w vreg1, %le_hi20_r(s)
308 // | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
309 // +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
310 // |
311 // | The large offset can be one of the forms:
312 // |
313 // +-> 1) Offset that has non zero bits in Hi20 and Lo12 bits:
314 // | OffsetHi20: lu12i.w vreg3, 4
315 // | OffsetLo12: ori voff, vreg3, 188 ------------------+
316 // | |
317 // +-> 2) Offset that has non zero bits in Hi20 bits only: |
318 // | OffsetHi20: lu12i.w voff, 128 ------------------+
319 // | |
320 // +-> 3) Offset that has non zero bits in Lo20 bits: |
321 // | OffsetHi20: lu12i.w vreg3, 121 ! |
322 // | OffsetLo12: ori voff, vreg3, 122 ! |
323 // | OffsetLo20: lu32i.d voff, 123 ------------------+
324 // +-> 4) Offset that has non zero bits in Hi12 bits: |
325 // OffsetHi20: lu12i.w vreg3, 121 ! |
326 // OffsetLo12: ori voff, vreg3, 122 ! |
327 // OffsetLo20: lu32i.d vreg3, 123 ! |
328 // OffsetHi12: lu52i.d voff, vrg3, 124 ------------------+
329 // |
330 // TailAdd: add.d vreg4, vreg2, voff <------------------+
331 //
foldLargeOffset(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last,MachineInstr & TailAdd,Register GAReg)332 bool LoongArchMergeBaseOffsetOpt::foldLargeOffset(
333 MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20,
334 MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd,
335 Register GAReg) {
336 assert((TailAdd.getOpcode() == LoongArch::ADD_W ||
337 TailAdd.getOpcode() == LoongArch::ADD_D) &&
338 "Expected ADD instruction!");
339 Register Rs = TailAdd.getOperand(1).getReg();
340 Register Rt = TailAdd.getOperand(2).getReg();
341 Register Reg = Rs == GAReg ? Rt : Rs;
342 SmallVector<MachineInstr *, 4> Instrs;
343 int64_t Offset = 0;
344 int64_t Mask = -1;
345
346 // This can point to one of [ORI, LU12I.W, LU32I.D, LU52I.D]:
347 for (int i = 0; i < 4; i++) {
348 // Handle Reg is R0.
349 if (Reg == LoongArch::R0)
350 break;
351
352 // Can't fold if the register has more than one use.
353 if (!Reg.isVirtual() || !MRI->hasOneUse(Reg))
354 return false;
355
356 MachineInstr *Curr = MRI->getVRegDef(Reg);
357 if (!Curr)
358 break;
359
360 switch (Curr->getOpcode()) {
361 default:
362 // Can't fold if the instruction opcode is unexpected.
363 return false;
364 case LoongArch::ORI: {
365 MachineOperand ImmOp = Curr->getOperand(2);
366 if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
367 return false;
368 Offset += ImmOp.getImm();
369 Reg = Curr->getOperand(1).getReg();
370 Instrs.push_back(Curr);
371 break;
372 }
373 case LoongArch::LU12I_W: {
374 MachineOperand ImmOp = Curr->getOperand(1);
375 if (ImmOp.getTargetFlags() != LoongArchII::MO_None)
376 return false;
377 Offset += SignExtend64<32>(ImmOp.getImm() << 12) & Mask;
378 Reg = LoongArch::R0;
379 Instrs.push_back(Curr);
380 break;
381 }
382 case LoongArch::LU32I_D: {
383 MachineOperand ImmOp = Curr->getOperand(2);
384 if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Lo20)
385 return false;
386 Offset += SignExtend64<52>(ImmOp.getImm() << 32) & Mask;
387 Mask ^= 0x000FFFFF00000000ULL;
388 Reg = Curr->getOperand(1).getReg();
389 Instrs.push_back(Curr);
390 break;
391 }
392 case LoongArch::LU52I_D: {
393 MachineOperand ImmOp = Curr->getOperand(2);
394 if (ImmOp.getTargetFlags() != LoongArchII::MO_None || !Hi12)
395 return false;
396 Offset += ImmOp.getImm() << 52;
397 Mask ^= 0xFFF0000000000000ULL;
398 Reg = Curr->getOperand(1).getReg();
399 Instrs.push_back(Curr);
400 break;
401 }
402 }
403 }
404
405 // Can't fold if the offset is not extracted.
406 if (!Offset)
407 return false;
408
409 foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset);
410 LLVM_DEBUG(dbgs() << " Offset Instrs:\n");
411 for (auto I : Instrs) {
412 LLVM_DEBUG(dbgs() << " " << *I);
413 I->eraseFromParent();
414 }
415
416 return true;
417 }
418
detectAndFoldOffset(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last)419 bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
420 MachineInstr &Lo12,
421 MachineInstr *&Lo20,
422 MachineInstr *&Hi12,
423 MachineInstr *&Last) {
424 Register DestReg =
425 Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();
426
427 // Look for arithmetic instructions we can get an offset from.
428 // We might be able to remove the arithmetic instructions by folding the
429 // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
430 // LU12I_W+PseudoAddTPRel+ADDI.
431 if (!MRI->hasOneUse(DestReg))
432 return false;
433
434 // DestReg has only one use.
435 MachineInstr &Tail = *MRI->use_instr_begin(DestReg);
436 switch (Tail.getOpcode()) {
437 default:
438 LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:"
439 << Tail);
440 break;
441 case LoongArch::ADDI_W:
442 if (ST->is64Bit())
443 return false;
444 [[fallthrough]];
445 case LoongArch::ADDI_D:
446 case LoongArch::ADDU16I_D: {
447 // Offset is simply an immediate operand.
448 int64_t Offset = Tail.getOperand(2).getImm();
449 if (Tail.getOpcode() == LoongArch::ADDU16I_D)
450 Offset = SignExtend64<32>(Offset << 16);
451
452 // We might have two ADDIs in a row.
453 Register TailDestReg = Tail.getOperand(0).getReg();
454 if (MRI->hasOneUse(TailDestReg)) {
455 MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg);
456 if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W)
457 return false;
458 if (TailTail.getOpcode() == LoongArch::ADDI_W ||
459 TailTail.getOpcode() == LoongArch::ADDI_D) {
460 Offset += TailTail.getOperand(2).getImm();
461 LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail);
462 foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset);
463 Tail.eraseFromParent();
464 return true;
465 }
466 }
467
468 LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail);
469 foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset);
470 return true;
471 }
472 case LoongArch::ADD_W:
473 if (ST->is64Bit())
474 return false;
475 [[fallthrough]];
476 case LoongArch::ADD_D:
477 // The offset is too large to fit in the immediate field of ADDI.
478 return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg);
479 break;
480 }
481
482 return false;
483 }
484
485 // Memory access opcode mapping for transforms.
getNewOpc(unsigned Op,bool isLarge)486 static unsigned getNewOpc(unsigned Op, bool isLarge) {
487 switch (Op) {
488 case LoongArch::LD_B:
489 return isLarge ? LoongArch::LDX_B : LoongArch::LD_B;
490 case LoongArch::LD_H:
491 return isLarge ? LoongArch::LDX_H : LoongArch::LD_H;
492 case LoongArch::LD_W:
493 case LoongArch::LDPTR_W:
494 return isLarge ? LoongArch::LDX_W : LoongArch::LD_W;
495 case LoongArch::LD_D:
496 case LoongArch::LDPTR_D:
497 return isLarge ? LoongArch::LDX_D : LoongArch::LD_D;
498 case LoongArch::LD_BU:
499 return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU;
500 case LoongArch::LD_HU:
501 return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU;
502 case LoongArch::LD_WU:
503 return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU;
504 case LoongArch::FLD_S:
505 return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S;
506 case LoongArch::FLD_D:
507 return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D;
508 case LoongArch::VLD:
509 return isLarge ? LoongArch::VLDX : LoongArch::VLD;
510 case LoongArch::XVLD:
511 return isLarge ? LoongArch::XVLDX : LoongArch::XVLD;
512 case LoongArch::VLDREPL_B:
513 return LoongArch::VLDREPL_B;
514 case LoongArch::XVLDREPL_B:
515 return LoongArch::XVLDREPL_B;
516 case LoongArch::ST_B:
517 return isLarge ? LoongArch::STX_B : LoongArch::ST_B;
518 case LoongArch::ST_H:
519 return isLarge ? LoongArch::STX_H : LoongArch::ST_H;
520 case LoongArch::ST_W:
521 case LoongArch::STPTR_W:
522 return isLarge ? LoongArch::STX_W : LoongArch::ST_W;
523 case LoongArch::ST_D:
524 case LoongArch::STPTR_D:
525 return isLarge ? LoongArch::STX_D : LoongArch::ST_D;
526 case LoongArch::FST_S:
527 return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S;
528 case LoongArch::FST_D:
529 return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D;
530 case LoongArch::VST:
531 return isLarge ? LoongArch::VSTX : LoongArch::VST;
532 case LoongArch::XVST:
533 return isLarge ? LoongArch::XVSTX : LoongArch::XVST;
534 default:
535 llvm_unreachable("Unexpected opcode for replacement");
536 }
537 }
538
foldIntoMemoryOps(MachineInstr & Hi20,MachineInstr & Lo12,MachineInstr * & Lo20,MachineInstr * & Hi12,MachineInstr * & Last)539 bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
540 MachineInstr &Lo12,
541 MachineInstr *&Lo20,
542 MachineInstr *&Hi12,
543 MachineInstr *&Last) {
544 Register DestReg =
545 Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg();
546
547 // If all the uses are memory ops with the same offset, we can transform:
548 //
549 // 1. (small/medium):
550 // 1.1. pcala
551 // pcalau12i vreg1, %pc_hi20(s)
552 // addi.d vreg2, vreg1, %pc_lo12(s)
553 // ld.w vreg3, 8(vreg2)
554 //
555 // =>
556 //
557 // pcalau12i vreg1, %pc_hi20(s+8)
558 // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)
559 //
560 // 1.2. tls-le
561 // lu12i.w vreg1, %le_hi20_r(s)
562 // add.w/d vreg2, vreg1, r2, %le_add_r(s)
563 // addi.w/d vreg3, vreg2, %le_lo12_r(s)
564 // ld.w vreg4, 8(vreg3)
565 //
566 // =>
567 //
568 // lu12i.w vreg1, %le_hi20_r(s+8)
569 // add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
570 // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
571 //
572 // 2. (large):
573 // pcalau12i vreg1, %pc_hi20(s)
574 // addi.d vreg2, $zero, %pc_lo12(s)
575 // lu32i.d vreg3, vreg2, %pc64_lo20(s)
576 // lu52i.d vreg4, vreg3, %pc64_hi12(s)
577 // add.d vreg5, vreg4, vreg1
578 // ld.w vreg6, 8(vreg5)
579 //
580 // =>
581 //
582 // pcalau12i vreg1, %pc_hi20(s+8)
583 // addi.d vreg2, $zero, %pc_lo12(s+8)
584 // lu32i.d vreg3, vreg2, %pc64_lo20(s+8)
585 // lu52i.d vreg4, vreg3, %pc64_hi12(s+8)
586 // ldx.w vreg6, vreg4, vreg1
587
588 std::optional<int64_t> CommonOffset;
589 DenseMap<const MachineInstr *, SmallVector<unsigned>>
590 InlineAsmMemoryOpIndexesMap;
591 for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
592 switch (UseMI.getOpcode()) {
593 default:
594 LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI);
595 return false;
596 case LoongArch::VLDREPL_B:
597 case LoongArch::XVLDREPL_B:
598 // We can't do this for large pattern.
599 if (Last)
600 return false;
601 [[fallthrough]];
602 case LoongArch::LD_B:
603 case LoongArch::LD_H:
604 case LoongArch::LD_W:
605 case LoongArch::LD_D:
606 case LoongArch::LD_BU:
607 case LoongArch::LD_HU:
608 case LoongArch::LD_WU:
609 case LoongArch::LDPTR_W:
610 case LoongArch::LDPTR_D:
611 case LoongArch::FLD_S:
612 case LoongArch::FLD_D:
613 case LoongArch::VLD:
614 case LoongArch::XVLD:
615 case LoongArch::ST_B:
616 case LoongArch::ST_H:
617 case LoongArch::ST_W:
618 case LoongArch::ST_D:
619 case LoongArch::STPTR_W:
620 case LoongArch::STPTR_D:
621 case LoongArch::FST_S:
622 case LoongArch::FST_D:
623 case LoongArch::VST:
624 case LoongArch::XVST: {
625 if (UseMI.getOperand(1).isFI())
626 return false;
627 // Register defined by Lo should not be the value register.
628 if (DestReg == UseMI.getOperand(0).getReg())
629 return false;
630 assert(DestReg == UseMI.getOperand(1).getReg() &&
631 "Expected base address use");
632 // All load/store instructions must use the same offset.
633 int64_t Offset = UseMI.getOperand(2).getImm();
634 if (CommonOffset && Offset != CommonOffset)
635 return false;
636 CommonOffset = Offset;
637 break;
638 }
639 case LoongArch::INLINEASM:
640 case LoongArch::INLINEASM_BR: {
641 // We can't do this for large pattern.
642 if (Last)
643 return false;
644 SmallVector<unsigned> InlineAsmMemoryOpIndexes;
645 unsigned NumOps = 0;
646 for (unsigned I = InlineAsm::MIOp_FirstOperand;
647 I < UseMI.getNumOperands(); I += 1 + NumOps) {
648 const MachineOperand &FlagsMO = UseMI.getOperand(I);
649 // Should be an imm.
650 if (!FlagsMO.isImm())
651 continue;
652
653 const InlineAsm::Flag Flags(FlagsMO.getImm());
654 NumOps = Flags.getNumOperandRegisters();
655
656 // Memory constraints have two operands.
657 if (NumOps != 2 || !Flags.isMemKind()) {
658 // If the register is used by something other than a memory contraint,
659 // we should not fold.
660 for (unsigned J = 0; J < NumOps; ++J) {
661 const MachineOperand &MO = UseMI.getOperand(I + 1 + J);
662 if (MO.isReg() && MO.getReg() == DestReg)
663 return false;
664 }
665 continue;
666 }
667
668 // We can only do this for constraint m.
669 if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m)
670 return false;
671
672 const MachineOperand &AddrMO = UseMI.getOperand(I + 1);
673 if (!AddrMO.isReg() || AddrMO.getReg() != DestReg)
674 continue;
675
676 const MachineOperand &OffsetMO = UseMI.getOperand(I + 2);
677 if (!OffsetMO.isImm())
678 continue;
679
680 // All inline asm memory operands must use the same offset.
681 int64_t Offset = OffsetMO.getImm();
682 if (CommonOffset && Offset != CommonOffset)
683 return false;
684 CommonOffset = Offset;
685 InlineAsmMemoryOpIndexes.push_back(I + 1);
686 }
687 InlineAsmMemoryOpIndexesMap.insert(
688 std::make_pair(&UseMI, InlineAsmMemoryOpIndexes));
689 break;
690 }
691 }
692 }
693
694 // We found a common offset.
695 // Update the offsets in global address lowering.
696 // We may have already folded some arithmetic so we need to add to any
697 // existing offset.
698 int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset;
699 // LA32 ignores the upper 32 bits.
700 if (!ST->is64Bit())
701 NewOffset = SignExtend64<32>(NewOffset);
702 // We can only fold simm32 offsets.
703 if (!isInt<32>(NewOffset))
704 return false;
705
706 // If optimized by this pass successfully, MO_RELAX bitmask target-flag should
707 // be removed from the pcala code sequence. Code sequence of tls-le can still
708 // be relaxed after being optimized.
709 //
710 // For example:
711 // pcalau12i $a0, %pc_hi20(symbol)
712 // addi.d $a0, $a0, %pc_lo12(symbol)
713 // ld.w $a0, $a0, 0
714 //
715 // =>
716 //
717 // pcalau12i $a0, %pc_hi20(symbol)
718 // ld.w $a0, $a0, %pc_lo12(symbol)
719 //
720 // Code sequence optimized before can be relax by linker. But after being
721 // optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
722 // carried by them.
723 Hi20.getOperand(1).setOffset(NewOffset);
724 MachineOperand &ImmOp = Lo12.getOperand(2);
725 ImmOp.setOffset(NewOffset);
726 if (Lo20 && Hi12) {
727 Lo20->getOperand(2).setOffset(NewOffset);
728 Hi12->getOperand(2).setOffset(NewOffset);
729 }
730 if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
731 Hi20.getOperand(1).setTargetFlags(
732 LoongArchII::getDirectFlags(Hi20.getOperand(1)));
733 ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
734 } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
735 MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
736 Add->getOperand(3).setOffset(NewOffset);
737 }
738
739 // Update the immediate in the load/store instructions to add the offset.
740 const LoongArchInstrInfo &TII = *ST->getInstrInfo();
741 for (MachineInstr &UseMI :
742 llvm::make_early_inc_range(MRI->use_instructions(DestReg))) {
743 if (UseMI.getOpcode() == LoongArch::INLINEASM ||
744 UseMI.getOpcode() == LoongArch::INLINEASM_BR) {
745 auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI];
746 for (unsigned I : InlineAsmMemoryOpIndexes) {
747 MachineOperand &MO = UseMI.getOperand(I + 1);
748 switch (ImmOp.getType()) {
749 case MachineOperand::MO_GlobalAddress:
750 MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(),
751 LoongArchII::getDirectFlags(ImmOp));
752 break;
753 case MachineOperand::MO_MCSymbol:
754 MO.ChangeToMCSymbol(ImmOp.getMCSymbol(),
755 LoongArchII::getDirectFlags(ImmOp));
756 MO.setOffset(ImmOp.getOffset());
757 break;
758 case MachineOperand::MO_BlockAddress:
759 MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(),
760 LoongArchII::getDirectFlags(ImmOp));
761 break;
762 case MachineOperand::MO_ConstantPoolIndex:
763 MO.ChangeToCPI(ImmOp.getIndex(), ImmOp.getOffset(),
764 LoongArchII::getDirectFlags(ImmOp));
765 break;
766 default:
767 report_fatal_error("unsupported machine operand type");
768 break;
769 }
770 }
771 } else {
772 UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last)));
773 if (Last) {
774 UseMI.removeOperand(2);
775 UseMI.removeOperand(1);
776 UseMI.addOperand(Last->getOperand(1));
777 UseMI.addOperand(Last->getOperand(2));
778 UseMI.getOperand(1).setIsKill(false);
779 UseMI.getOperand(2).setIsKill(false);
780 } else {
781 UseMI.removeOperand(2);
782 UseMI.addOperand(ImmOp);
783 }
784 }
785 }
786
787 if (Last) {
788 Last->eraseFromParent();
789 return true;
790 }
791
792 if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
793 MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
794 Hi20.getOperand(0).getReg());
795 } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
796 MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
797 MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
798 Add->getOperand(0).getReg());
799 }
800 Lo12.eraseFromParent();
801 return true;
802 }
803
runOnMachineFunction(MachineFunction & Fn)804 bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
805 if (skipFunction(Fn.getFunction()))
806 return false;
807
808 ST = &Fn.getSubtarget<LoongArchSubtarget>();
809
810 bool MadeChange = false;
811 MRI = &Fn.getRegInfo();
812 for (MachineBasicBlock &MBB : Fn) {
813 LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
814 for (MachineInstr &Hi20 : MBB) {
815 MachineInstr *Lo12 = nullptr;
816 MachineInstr *Lo20 = nullptr;
817 MachineInstr *Hi12 = nullptr;
818 MachineInstr *Last = nullptr;
819 if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
820 // Detect foldable pcala code sequence in small/medium/large code model.
821 if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
822 continue;
823 } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
824 MachineInstr *Add = nullptr;
825 // Detect foldable tls-le code sequence in small/medium code model.
826 if (!detectFoldable(Hi20, Add, Lo12))
827 continue;
828 } else {
829 continue;
830 }
831 // For tls-le, we do not pass the second PseudoAddTPRel instr in order to
832 // reuse the existing hooks and the last three paramaters should always be
833 // nullptr.
834 MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
835 MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
836 }
837 }
838
839 return MadeChange;
840 }
841
842 /// Returns an instance of the Merge Base Offset Optimization pass.
createLoongArchMergeBaseOffsetOptPass()843 FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() {
844 return new LoongArchMergeBaseOffsetOpt();
845 }
846