xref: /freebsd/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchOptWInstrs.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- LoongArchOptWInstrs.cpp - MI W instruction optimizations ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This pass does some optimizations for *W instructions at the MI level.
10 //
11 // First it removes unneeded sext(addi.w rd, rs, 0) instructions. Either
12 // because the sign extended bits aren't consumed or because the input was
13 // already sign extended by an earlier instruction.
14 //
15 // Then:
// 1. Unless explicitly disabled or the target prefers instructions with the
//    W suffix, it removes the W suffix from opw instructions whenever all
//    users depend only on the lower word of the result of the instruction.
19 //    The cases handled are:
20 //    * addi.w because it helps reduce test differences between LA32 and LA64
21 //      w/o being a pessimization.
22 //
// 2. Or, if explicitly enabled or the target prefers instructions with the
//    W suffix, it adds the W suffix to the instruction whenever all users
//    depend only on the lower word of the result of the instruction.
26 //    The cases handled are:
27 //    * add.d/addi.d/sub.d/mul.d.
28 //    * slli.d with imm < 32.
29 //    * ld.d/ld.wu.
30 //===---------------------------------------------------------------------===//
31 
32 #include "LoongArch.h"
33 #include "LoongArchMachineFunctionInfo.h"
34 #include "LoongArchSubtarget.h"
35 #include "llvm/ADT/SmallSet.h"
36 #include "llvm/ADT/Statistic.h"
37 #include "llvm/CodeGen/MachineFunctionPass.h"
38 #include "llvm/CodeGen/TargetInstrInfo.h"
39 
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "loongarch-opt-w-instrs"
43 #define LOONGARCH_OPT_W_INSTRS_NAME "LoongArch Optimize W Instructions"
44 
45 STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
46 STATISTIC(NumTransformedToWInstrs,
47           "Number of instructions transformed to W-ops");
48 
49 static cl::opt<bool>
50     DisableSExtWRemoval("loongarch-disable-sextw-removal",
51                         cl::desc("Disable removal of sign-extend insn"),
52                         cl::init(false), cl::Hidden);
53 static cl::opt<bool>
54     DisableCvtToDSuffix("loongarch-disable-cvt-to-d-suffix",
55                         cl::desc("Disable convert to D suffix"),
56                         cl::init(false), cl::Hidden);
57 
58 namespace {
59 
60 class LoongArchOptWInstrs : public MachineFunctionPass {
61 public:
62   static char ID;
63 
64   LoongArchOptWInstrs() : MachineFunctionPass(ID) {}
65 
66   bool runOnMachineFunction(MachineFunction &MF) override;
67   bool removeSExtWInstrs(MachineFunction &MF, const LoongArchInstrInfo &TII,
68                          const LoongArchSubtarget &ST,
69                          MachineRegisterInfo &MRI);
70   bool convertToDSuffixes(MachineFunction &MF, const LoongArchInstrInfo &TII,
71                           const LoongArchSubtarget &ST,
72                           MachineRegisterInfo &MRI);
73   bool convertToWSuffixes(MachineFunction &MF, const LoongArchInstrInfo &TII,
74                           const LoongArchSubtarget &ST,
75                           MachineRegisterInfo &MRI);
76 
77   void getAnalysisUsage(AnalysisUsage &AU) const override {
78     AU.setPreservesCFG();
79     MachineFunctionPass::getAnalysisUsage(AU);
80   }
81 
82   StringRef getPassName() const override { return LOONGARCH_OPT_W_INSTRS_NAME; }
83 };
84 
85 } // end anonymous namespace
86 
87 char LoongArchOptWInstrs::ID = 0;
88 INITIALIZE_PASS(LoongArchOptWInstrs, DEBUG_TYPE, LOONGARCH_OPT_W_INSTRS_NAME,
89                 false, false)
90 
91 FunctionPass *llvm::createLoongArchOptWInstrsPass() {
92   return new LoongArchOptWInstrs();
93 }
94 
95 // Checks if all users only demand the lower \p OrigBits of the original
96 // instruction's result.
97 // TODO: handle multiple interdependent transformations
98 static bool hasAllNBitUsers(const MachineInstr &OrigMI,
99                             const LoongArchSubtarget &ST,
100                             const MachineRegisterInfo &MRI, unsigned OrigBits) {
101 
102   SmallSet<std::pair<const MachineInstr *, unsigned>, 4> Visited;
103   SmallVector<std::pair<const MachineInstr *, unsigned>, 4> Worklist;
104 
105   Worklist.push_back(std::make_pair(&OrigMI, OrigBits));
106 
107   while (!Worklist.empty()) {
108     auto P = Worklist.pop_back_val();
109     const MachineInstr *MI = P.first;
110     unsigned Bits = P.second;
111 
112     if (!Visited.insert(P).second)
113       continue;
114 
115     // Only handle instructions with one def.
116     if (MI->getNumExplicitDefs() != 1)
117       return false;
118 
119     Register DestReg = MI->getOperand(0).getReg();
120     if (!DestReg.isVirtual())
121       return false;
122 
123     for (auto &UserOp : MRI.use_nodbg_operands(DestReg)) {
124       const MachineInstr *UserMI = UserOp.getParent();
125       unsigned OpIdx = UserOp.getOperandNo();
126 
127       switch (UserMI->getOpcode()) {
128       default:
129         // TODO: Add vector
130         return false;
131 
132       case LoongArch::ADD_W:
133       case LoongArch::ADDI_W:
134       case LoongArch::SUB_W:
135       case LoongArch::ALSL_W:
136       case LoongArch::ALSL_WU:
137       case LoongArch::MUL_W:
138       case LoongArch::MULH_W:
139       case LoongArch::MULH_WU:
140       case LoongArch::MULW_D_W:
141       case LoongArch::MULW_D_WU:
142       // TODO: {DIV,MOD}.{W,WU} consumes the upper 32 bits before LA664+.
143       // case LoongArch::DIV_W:
144       // case LoongArch::DIV_WU:
145       // case LoongArch::MOD_W:
146       // case LoongArch::MOD_WU:
147       case LoongArch::SLL_W:
148       case LoongArch::SLLI_W:
149       case LoongArch::SRL_W:
150       case LoongArch::SRLI_W:
151       case LoongArch::SRA_W:
152       case LoongArch::SRAI_W:
153       case LoongArch::ROTR_W:
154       case LoongArch::ROTRI_W:
155       case LoongArch::CLO_W:
156       case LoongArch::CLZ_W:
157       case LoongArch::CTO_W:
158       case LoongArch::CTZ_W:
159       case LoongArch::BYTEPICK_W:
160       case LoongArch::REVB_2H:
161       case LoongArch::BITREV_4B:
162       case LoongArch::BITREV_W:
163       case LoongArch::BSTRINS_W:
164       case LoongArch::BSTRPICK_W:
165       case LoongArch::CRC_W_W_W:
166       case LoongArch::CRCC_W_W_W:
167       case LoongArch::MOVGR2FCSR:
168       case LoongArch::MOVGR2FRH_W:
169       case LoongArch::MOVGR2FR_W_64:
170         if (Bits >= 32)
171           break;
172         return false;
173       case LoongArch::MOVGR2CF:
174         if (Bits >= 1)
175           break;
176         return false;
177       case LoongArch::EXT_W_B:
178         if (Bits >= 8)
179           break;
180         return false;
181       case LoongArch::EXT_W_H:
182         if (Bits >= 16)
183           break;
184         return false;
185 
186       case LoongArch::SRLI_D: {
187         // If we are shifting right by less than Bits, and users don't demand
188         // any bits that were shifted into [Bits-1:0], then we can consider this
189         // as an N-Bit user.
190         unsigned ShAmt = UserMI->getOperand(2).getImm();
191         if (Bits > ShAmt) {
192           Worklist.push_back(std::make_pair(UserMI, Bits - ShAmt));
193           break;
194         }
195         return false;
196       }
197 
198       // these overwrite higher input bits, otherwise the lower word of output
199       // depends only on the lower word of input. So check their uses read W.
200       case LoongArch::SLLI_D:
201         if (Bits >= (ST.getGRLen() - UserMI->getOperand(2).getImm()))
202           break;
203         Worklist.push_back(std::make_pair(UserMI, Bits));
204         break;
205       case LoongArch::ANDI: {
206         uint64_t Imm = UserMI->getOperand(2).getImm();
207         if (Bits >= (unsigned)llvm::bit_width(Imm))
208           break;
209         Worklist.push_back(std::make_pair(UserMI, Bits));
210         break;
211       }
212       case LoongArch::ORI: {
213         uint64_t Imm = UserMI->getOperand(2).getImm();
214         if (Bits >= (unsigned)llvm::bit_width<uint64_t>(~Imm))
215           break;
216         Worklist.push_back(std::make_pair(UserMI, Bits));
217         break;
218       }
219 
220       case LoongArch::SLL_D:
221         // Operand 2 is the shift amount which uses log2(grlen) bits.
222         if (OpIdx == 2) {
223           if (Bits >= Log2_32(ST.getGRLen()))
224             break;
225           return false;
226         }
227         Worklist.push_back(std::make_pair(UserMI, Bits));
228         break;
229 
230       case LoongArch::SRA_D:
231       case LoongArch::SRL_D:
232       case LoongArch::ROTR_D:
233         // Operand 2 is the shift amount which uses 6 bits.
234         if (OpIdx == 2 && Bits >= Log2_32(ST.getGRLen()))
235           break;
236         return false;
237 
238       case LoongArch::ST_B:
239       case LoongArch::STX_B:
240       case LoongArch::STGT_B:
241       case LoongArch::STLE_B:
242       case LoongArch::IOCSRWR_B:
243         // The first argument is the value to store.
244         if (OpIdx == 0 && Bits >= 8)
245           break;
246         return false;
247       case LoongArch::ST_H:
248       case LoongArch::STX_H:
249       case LoongArch::STGT_H:
250       case LoongArch::STLE_H:
251       case LoongArch::IOCSRWR_H:
252         // The first argument is the value to store.
253         if (OpIdx == 0 && Bits >= 16)
254           break;
255         return false;
256       case LoongArch::ST_W:
257       case LoongArch::STX_W:
258       case LoongArch::SCREL_W:
259       case LoongArch::STPTR_W:
260       case LoongArch::STGT_W:
261       case LoongArch::STLE_W:
262       case LoongArch::IOCSRWR_W:
263         // The first argument is the value to store.
264         if (OpIdx == 0 && Bits >= 32)
265           break;
266         return false;
267 
268       case LoongArch::CRC_W_B_W:
269       case LoongArch::CRCC_W_B_W:
270         if ((OpIdx == 1 && Bits >= 8) || (OpIdx == 2 && Bits >= 32))
271           break;
272         return false;
273       case LoongArch::CRC_W_H_W:
274       case LoongArch::CRCC_W_H_W:
275         if ((OpIdx == 1 && Bits >= 16) || (OpIdx == 2 && Bits >= 32))
276           break;
277         return false;
278       case LoongArch::CRC_W_D_W:
279       case LoongArch::CRCC_W_D_W:
280         if (OpIdx == 2 && Bits >= 32)
281           break;
282         return false;
283 
284       // For these, lower word of output in these operations, depends only on
285       // the lower word of input. So, we check all uses only read lower word.
286       case LoongArch::COPY:
287       case LoongArch::PHI:
288       case LoongArch::ADD_D:
289       case LoongArch::ADDI_D:
290       case LoongArch::SUB_D:
291       case LoongArch::MUL_D:
292       case LoongArch::AND:
293       case LoongArch::OR:
294       case LoongArch::NOR:
295       case LoongArch::XOR:
296       case LoongArch::XORI:
297       case LoongArch::ANDN:
298       case LoongArch::ORN:
299         Worklist.push_back(std::make_pair(UserMI, Bits));
300         break;
301 
302       case LoongArch::MASKNEZ:
303       case LoongArch::MASKEQZ:
304         if (OpIdx != 1)
305           return false;
306         Worklist.push_back(std::make_pair(UserMI, Bits));
307         break;
308       }
309     }
310   }
311 
312   return true;
313 }
314 
315 static bool hasAllWUsers(const MachineInstr &OrigMI,
316                          const LoongArchSubtarget &ST,
317                          const MachineRegisterInfo &MRI) {
318   return hasAllNBitUsers(OrigMI, ST, MRI, 32);
319 }
320 
321 // This function returns true if the machine instruction always outputs a value
322 // where bits 63:32 match bit 31.
323 static bool isSignExtendingOpW(const MachineInstr &MI,
324                                const MachineRegisterInfo &MRI, unsigned OpNo) {
325   switch (MI.getOpcode()) {
326   // Normal cases
327   case LoongArch::ADD_W:
328   case LoongArch::SUB_W:
329   case LoongArch::ADDI_W:
330   case LoongArch::ALSL_W:
331   case LoongArch::LU12I_W:
332   case LoongArch::SLT:
333   case LoongArch::SLTU:
334   case LoongArch::SLTI:
335   case LoongArch::SLTUI:
336   case LoongArch::ANDI:
337   case LoongArch::MUL_W:
338   case LoongArch::MULH_W:
339   case LoongArch::MULH_WU:
340   case LoongArch::DIV_W:
341   case LoongArch::MOD_W:
342   case LoongArch::DIV_WU:
343   case LoongArch::MOD_WU:
344   case LoongArch::SLL_W:
345   case LoongArch::SRL_W:
346   case LoongArch::SRA_W:
347   case LoongArch::ROTR_W:
348   case LoongArch::SLLI_W:
349   case LoongArch::SRLI_W:
350   case LoongArch::SRAI_W:
351   case LoongArch::ROTRI_W:
352   case LoongArch::EXT_W_B:
353   case LoongArch::EXT_W_H:
354   case LoongArch::CLO_W:
355   case LoongArch::CLZ_W:
356   case LoongArch::CTO_W:
357   case LoongArch::CTZ_W:
358   case LoongArch::BYTEPICK_W:
359   case LoongArch::REVB_2H:
360   case LoongArch::BITREV_4B:
361   case LoongArch::BITREV_W:
362   case LoongArch::BSTRINS_W:
363   case LoongArch::BSTRPICK_W:
364   case LoongArch::LD_B:
365   case LoongArch::LD_H:
366   case LoongArch::LD_W:
367   case LoongArch::LD_BU:
368   case LoongArch::LD_HU:
369   case LoongArch::LL_W:
370   case LoongArch::LLACQ_W:
371   case LoongArch::RDTIMEL_W:
372   case LoongArch::RDTIMEH_W:
373   case LoongArch::CPUCFG:
374   case LoongArch::LDX_B:
375   case LoongArch::LDX_H:
376   case LoongArch::LDX_W:
377   case LoongArch::LDX_BU:
378   case LoongArch::LDX_HU:
379   case LoongArch::LDPTR_W:
380   case LoongArch::LDGT_B:
381   case LoongArch::LDGT_H:
382   case LoongArch::LDGT_W:
383   case LoongArch::LDLE_B:
384   case LoongArch::LDLE_H:
385   case LoongArch::LDLE_W:
386   case LoongArch::AMSWAP_B:
387   case LoongArch::AMSWAP_H:
388   case LoongArch::AMSWAP_W:
389   case LoongArch::AMADD_B:
390   case LoongArch::AMADD_H:
391   case LoongArch::AMADD_W:
392   case LoongArch::AMAND_W:
393   case LoongArch::AMOR_W:
394   case LoongArch::AMXOR_W:
395   case LoongArch::AMMAX_W:
396   case LoongArch::AMMIN_W:
397   case LoongArch::AMMAX_WU:
398   case LoongArch::AMMIN_WU:
399   case LoongArch::AMSWAP__DB_B:
400   case LoongArch::AMSWAP__DB_H:
401   case LoongArch::AMSWAP__DB_W:
402   case LoongArch::AMADD__DB_B:
403   case LoongArch::AMADD__DB_H:
404   case LoongArch::AMADD__DB_W:
405   case LoongArch::AMAND__DB_W:
406   case LoongArch::AMOR__DB_W:
407   case LoongArch::AMXOR__DB_W:
408   case LoongArch::AMMAX__DB_W:
409   case LoongArch::AMMIN__DB_W:
410   case LoongArch::AMMAX__DB_WU:
411   case LoongArch::AMMIN__DB_WU:
412   case LoongArch::AMCAS_B:
413   case LoongArch::AMCAS_H:
414   case LoongArch::AMCAS_W:
415   case LoongArch::AMCAS__DB_B:
416   case LoongArch::AMCAS__DB_H:
417   case LoongArch::AMCAS__DB_W:
418   case LoongArch::CRC_W_B_W:
419   case LoongArch::CRC_W_H_W:
420   case LoongArch::CRC_W_W_W:
421   case LoongArch::CRC_W_D_W:
422   case LoongArch::CRCC_W_B_W:
423   case LoongArch::CRCC_W_H_W:
424   case LoongArch::CRCC_W_W_W:
425   case LoongArch::CRCC_W_D_W:
426   case LoongArch::IOCSRRD_B:
427   case LoongArch::IOCSRRD_H:
428   case LoongArch::IOCSRRD_W:
429   case LoongArch::MOVFR2GR_S:
430   case LoongArch::MOVFCSR2GR:
431   case LoongArch::MOVCF2GR:
432   case LoongArch::MOVFRH2GR_S:
433   case LoongArch::MOVFR2GR_S_64:
434     // TODO: Add vector
435     return true;
436   // Special cases that require checking operands.
437   // shifting right sufficiently makes the value 32-bit sign-extended
438   case LoongArch::SRAI_D:
439     return MI.getOperand(2).getImm() >= 32;
440   case LoongArch::SRLI_D:
441     return MI.getOperand(2).getImm() > 32;
442   // The LI pattern ADDI rd, R0, imm and ORI rd, R0, imm are sign extended.
443   case LoongArch::ADDI_D:
444   case LoongArch::ORI:
445     return MI.getOperand(1).isReg() &&
446            MI.getOperand(1).getReg() == LoongArch::R0;
447   // A bits extract is sign extended if the msb is less than 31.
448   case LoongArch::BSTRPICK_D:
449     return MI.getOperand(2).getImm() < 31;
450   // Copying from R0 produces zero.
451   case LoongArch::COPY:
452     return MI.getOperand(1).getReg() == LoongArch::R0;
453   // Ignore the scratch register destination.
454   case LoongArch::PseudoMaskedAtomicSwap32:
455   case LoongArch::PseudoAtomicSwap32:
456   case LoongArch::PseudoMaskedAtomicLoadAdd32:
457   case LoongArch::PseudoMaskedAtomicLoadSub32:
458   case LoongArch::PseudoAtomicLoadNand32:
459   case LoongArch::PseudoMaskedAtomicLoadNand32:
460   case LoongArch::PseudoAtomicLoadAdd32:
461   case LoongArch::PseudoAtomicLoadSub32:
462   case LoongArch::PseudoAtomicLoadAnd32:
463   case LoongArch::PseudoAtomicLoadOr32:
464   case LoongArch::PseudoAtomicLoadXor32:
465   case LoongArch::PseudoMaskedAtomicLoadUMax32:
466   case LoongArch::PseudoMaskedAtomicLoadUMin32:
467   case LoongArch::PseudoCmpXchg32:
468   case LoongArch::PseudoMaskedCmpXchg32:
469   case LoongArch::PseudoMaskedAtomicLoadMax32:
470   case LoongArch::PseudoMaskedAtomicLoadMin32:
471     return OpNo == 0;
472   }
473 
474   return false;
475 }
476 
477 static bool isSignExtendedW(Register SrcReg, const LoongArchSubtarget &ST,
478                             const MachineRegisterInfo &MRI,
479                             SmallPtrSetImpl<MachineInstr *> &FixableDef) {
480   SmallSet<Register, 4> Visited;
481   SmallVector<Register, 4> Worklist;
482 
483   auto AddRegToWorkList = [&](Register SrcReg) {
484     if (!SrcReg.isVirtual())
485       return false;
486     Worklist.push_back(SrcReg);
487     return true;
488   };
489 
490   if (!AddRegToWorkList(SrcReg))
491     return false;
492 
493   while (!Worklist.empty()) {
494     Register Reg = Worklist.pop_back_val();
495 
496     // If we already visited this register, we don't need to check it again.
497     if (!Visited.insert(Reg).second)
498       continue;
499 
500     MachineInstr *MI = MRI.getVRegDef(Reg);
501     if (!MI)
502       continue;
503 
504     int OpNo = MI->findRegisterDefOperandIdx(Reg, /*TRI=*/nullptr);
505     assert(OpNo != -1 && "Couldn't find register");
506 
507     // If this is a sign extending operation we don't need to look any further.
508     if (isSignExtendingOpW(*MI, MRI, OpNo))
509       continue;
510 
511     // Is this an instruction that propagates sign extend?
512     switch (MI->getOpcode()) {
513     default:
514       // Unknown opcode, give up.
515       return false;
516     case LoongArch::COPY: {
517       const MachineFunction *MF = MI->getMF();
518       const LoongArchMachineFunctionInfo *LAFI =
519           MF->getInfo<LoongArchMachineFunctionInfo>();
520 
521       // If this is the entry block and the register is livein, see if we know
522       // it is sign extended.
523       if (MI->getParent() == &MF->front()) {
524         Register VReg = MI->getOperand(0).getReg();
525         if (MF->getRegInfo().isLiveIn(VReg) && LAFI->isSExt32Register(VReg))
526           continue;
527       }
528 
529       Register CopySrcReg = MI->getOperand(1).getReg();
530       if (CopySrcReg == LoongArch::R4) {
531         // For a method return value, we check the ZExt/SExt flags in attribute.
532         // We assume the following code sequence for method call.
533         // PseudoCALL @bar, ...
534         // ADJCALLSTACKUP 0, 0, implicit-def dead $r3, implicit $r3
535         // %0:gpr = COPY $r4
536         //
537         // We use the PseudoCall to look up the IR function being called to find
538         // its return attributes.
539         const MachineBasicBlock *MBB = MI->getParent();
540         auto II = MI->getIterator();
541         if (II == MBB->instr_begin() ||
542             (--II)->getOpcode() != LoongArch::ADJCALLSTACKUP)
543           return false;
544 
545         const MachineInstr &CallMI = *(--II);
546         if (!CallMI.isCall() || !CallMI.getOperand(0).isGlobal())
547           return false;
548 
549         auto *CalleeFn =
550             dyn_cast_if_present<Function>(CallMI.getOperand(0).getGlobal());
551         if (!CalleeFn)
552           return false;
553 
554         auto *IntTy = dyn_cast<IntegerType>(CalleeFn->getReturnType());
555         if (!IntTy)
556           return false;
557 
558         const AttributeSet &Attrs = CalleeFn->getAttributes().getRetAttrs();
559         unsigned BitWidth = IntTy->getBitWidth();
560         if ((BitWidth <= 32 && Attrs.hasAttribute(Attribute::SExt)) ||
561             (BitWidth < 32 && Attrs.hasAttribute(Attribute::ZExt)))
562           continue;
563       }
564 
565       if (!AddRegToWorkList(CopySrcReg))
566         return false;
567 
568       break;
569     }
570 
571     // For these, we just need to check if the 1st operand is sign extended.
572     case LoongArch::MOD_D:
573     case LoongArch::ANDI:
574     case LoongArch::ORI:
575     case LoongArch::XORI:
576       // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R.
577       // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1
578       // Logical operations use a sign extended 12-bit immediate.
579       if (!AddRegToWorkList(MI->getOperand(1).getReg()))
580         return false;
581 
582       break;
583     case LoongArch::MOD_DU:
584     case LoongArch::AND:
585     case LoongArch::OR:
586     case LoongArch::XOR:
587     case LoongArch::ANDN:
588     case LoongArch::ORN:
589     case LoongArch::PHI: {
590       // If all incoming values are sign-extended, the output of AND, OR, XOR,
591       // or PHI is also sign-extended.
592 
593       // The input registers for PHI are operand 1, 3, ...
594       // The input registers for others are operand 1 and 2.
595       unsigned B = 1, E = 3, D = 1;
596       switch (MI->getOpcode()) {
597       case LoongArch::PHI:
598         E = MI->getNumOperands();
599         D = 2;
600         break;
601       }
602 
603       for (unsigned I = B; I != E; I += D) {
604         if (!MI->getOperand(I).isReg())
605           return false;
606 
607         if (!AddRegToWorkList(MI->getOperand(I).getReg()))
608           return false;
609       }
610 
611       break;
612     }
613 
614     case LoongArch::MASKEQZ:
615     case LoongArch::MASKNEZ:
616       // Instructions return zero or operand 1. Result is sign extended if
617       // operand 1 is sign extended.
618       if (!AddRegToWorkList(MI->getOperand(1).getReg()))
619         return false;
620       break;
621 
622     // With these opcode, we can "fix" them with the W-version
623     // if we know all users of the result only rely on bits 31:0
624     case LoongArch::SLLI_D:
625       // SLLI_W reads the lowest 5 bits, while SLLI_D reads lowest 6 bits
626       if (MI->getOperand(2).getImm() >= 32)
627         return false;
628       [[fallthrough]];
629     case LoongArch::ADDI_D:
630     case LoongArch::ADD_D:
631     case LoongArch::LD_D:
632     case LoongArch::LD_WU:
633     case LoongArch::MUL_D:
634     case LoongArch::SUB_D:
635       if (hasAllWUsers(*MI, ST, MRI)) {
636         FixableDef.insert(MI);
637         break;
638       }
639       return false;
640     // If all incoming values are sign-extended and all users only use
641     // the lower 32 bits, then convert them to W versions.
642     case LoongArch::DIV_D: {
643       if (!AddRegToWorkList(MI->getOperand(1).getReg()))
644         return false;
645       if (!AddRegToWorkList(MI->getOperand(2).getReg()))
646         return false;
647       if (hasAllWUsers(*MI, ST, MRI)) {
648         FixableDef.insert(MI);
649         break;
650       }
651       return false;
652     }
653     }
654   }
655 
656   // If we get here, then every node we visited produces a sign extended value
657   // or propagated sign extended values. So the result must be sign extended.
658   return true;
659 }
660 
661 static unsigned getWOp(unsigned Opcode) {
662   switch (Opcode) {
663   case LoongArch::ADDI_D:
664     return LoongArch::ADDI_W;
665   case LoongArch::ADD_D:
666     return LoongArch::ADD_W;
667   case LoongArch::DIV_D:
668     return LoongArch::DIV_W;
669   case LoongArch::LD_D:
670   case LoongArch::LD_WU:
671     return LoongArch::LD_W;
672   case LoongArch::MUL_D:
673     return LoongArch::MUL_W;
674   case LoongArch::SLLI_D:
675     return LoongArch::SLLI_W;
676   case LoongArch::SUB_D:
677     return LoongArch::SUB_W;
678   default:
679     llvm_unreachable("Unexpected opcode for replacement with W variant");
680   }
681 }
682 
683 bool LoongArchOptWInstrs::removeSExtWInstrs(MachineFunction &MF,
684                                             const LoongArchInstrInfo &TII,
685                                             const LoongArchSubtarget &ST,
686                                             MachineRegisterInfo &MRI) {
687   if (DisableSExtWRemoval)
688     return false;
689 
690   bool MadeChange = false;
691   for (MachineBasicBlock &MBB : MF) {
692     for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
693       // We're looking for the sext.w pattern ADDI.W rd, rs, 0.
694       if (!LoongArch::isSEXT_W(MI))
695         continue;
696 
697       Register SrcReg = MI.getOperand(1).getReg();
698 
699       SmallPtrSet<MachineInstr *, 4> FixableDefs;
700 
701       // If all users only use the lower bits, this sext.w is redundant.
702       // Or if all definitions reaching MI sign-extend their output,
703       // then sext.w is redundant.
704       if (!hasAllWUsers(MI, ST, MRI) &&
705           !isSignExtendedW(SrcReg, ST, MRI, FixableDefs))
706         continue;
707 
708       Register DstReg = MI.getOperand(0).getReg();
709       if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
710         continue;
711 
712       // Convert Fixable instructions to their W versions.
713       for (MachineInstr *Fixable : FixableDefs) {
714         LLVM_DEBUG(dbgs() << "Replacing " << *Fixable);
715         Fixable->setDesc(TII.get(getWOp(Fixable->getOpcode())));
716         Fixable->clearFlag(MachineInstr::MIFlag::NoSWrap);
717         Fixable->clearFlag(MachineInstr::MIFlag::NoUWrap);
718         Fixable->clearFlag(MachineInstr::MIFlag::IsExact);
719         LLVM_DEBUG(dbgs() << "     with " << *Fixable);
720         ++NumTransformedToWInstrs;
721       }
722 
723       LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
724       MRI.replaceRegWith(DstReg, SrcReg);
725       MRI.clearKillFlags(SrcReg);
726       MI.eraseFromParent();
727       ++NumRemovedSExtW;
728       MadeChange = true;
729     }
730   }
731 
732   return MadeChange;
733 }
734 
735 bool LoongArchOptWInstrs::convertToDSuffixes(MachineFunction &MF,
736                                              const LoongArchInstrInfo &TII,
737                                              const LoongArchSubtarget &ST,
738                                              MachineRegisterInfo &MRI) {
739   bool MadeChange = false;
740   for (MachineBasicBlock &MBB : MF) {
741     for (MachineInstr &MI : MBB) {
742       unsigned Opc;
743       switch (MI.getOpcode()) {
744       default:
745         continue;
746       case LoongArch::ADDI_W:
747         Opc = LoongArch::ADDI_D;
748         break;
749       }
750 
751       if (hasAllWUsers(MI, ST, MRI)) {
752         MI.setDesc(TII.get(Opc));
753         MadeChange = true;
754       }
755     }
756   }
757 
758   return MadeChange;
759 }
760 
761 bool LoongArchOptWInstrs::convertToWSuffixes(MachineFunction &MF,
762                                              const LoongArchInstrInfo &TII,
763                                              const LoongArchSubtarget &ST,
764                                              MachineRegisterInfo &MRI) {
765   bool MadeChange = false;
766   for (MachineBasicBlock &MBB : MF) {
767     for (MachineInstr &MI : MBB) {
768       unsigned WOpc;
769       // TODO: Add more?
770       switch (MI.getOpcode()) {
771       default:
772         continue;
773       case LoongArch::ADD_D:
774         WOpc = LoongArch::ADD_W;
775         break;
776       case LoongArch::ADDI_D:
777         WOpc = LoongArch::ADDI_W;
778         break;
779       case LoongArch::SUB_D:
780         WOpc = LoongArch::SUB_W;
781         break;
782       case LoongArch::MUL_D:
783         WOpc = LoongArch::MUL_W;
784         break;
785       case LoongArch::SLLI_D:
786         // SLLI.W reads the lowest 5 bits, while SLLI.D reads lowest 6 bits
787         if (MI.getOperand(2).getImm() >= 32)
788           continue;
789         WOpc = LoongArch::SLLI_W;
790         break;
791       case LoongArch::LD_D:
792       case LoongArch::LD_WU:
793         WOpc = LoongArch::LD_W;
794         break;
795       }
796 
797       if (hasAllWUsers(MI, ST, MRI)) {
798         LLVM_DEBUG(dbgs() << "Replacing " << MI);
799         MI.setDesc(TII.get(WOpc));
800         MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
801         MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
802         MI.clearFlag(MachineInstr::MIFlag::IsExact);
803         LLVM_DEBUG(dbgs() << "     with " << MI);
804         ++NumTransformedToWInstrs;
805         MadeChange = true;
806       }
807     }
808   }
809 
810   return MadeChange;
811 }
812 
813 bool LoongArchOptWInstrs::runOnMachineFunction(MachineFunction &MF) {
814   if (skipFunction(MF.getFunction()))
815     return false;
816 
817   MachineRegisterInfo &MRI = MF.getRegInfo();
818   const LoongArchSubtarget &ST = MF.getSubtarget<LoongArchSubtarget>();
819   const LoongArchInstrInfo &TII = *ST.getInstrInfo();
820 
821   if (!ST.is64Bit())
822     return false;
823 
824   bool MadeChange = false;
825   MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI);
826 
827   if (!(DisableCvtToDSuffix || ST.preferWInst()))
828     MadeChange |= convertToDSuffixes(MF, TII, ST, MRI);
829 
830   if (ST.preferWInst())
831     MadeChange |= convertToWSuffixes(MF, TII, ST, MRI);
832 
833   return MadeChange;
834 }
835