//=== lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "aarch64-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

/// Return true if a G_FCONSTANT instruction is known to be better-represented
/// as a G_CONSTANT.
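///
/// For example (an illustrative sketch; the register names and constant value
/// are placeholders), a float constant whose only uses are stores:
///
///   %cst:_(s32) = G_FCONSTANT float 1.000000e+00
///   G_STORE %cst(s32), %ptr(p0)
///
/// can instead be materialized as the equivalent integer bit pattern
/// (0x3F800000):
///
///   %cst:_(s32) = G_CONSTANT i32 1065353216
///   G_STORE %cst(s32), %ptr(p0)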
static bool matchFConstantToConstant(MachineInstr &MI,
                                     MachineRegisterInfo &MRI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
  if (DstSize != 32 && DstSize != 64)
    return false;

  // When we're storing a value, it doesn't matter what register bank it's on.
  // Since not all floating point constants can be materialized using a fmov,
  // it makes more sense to just use a GPR.
  return all_of(MRI.use_nodbg_instructions(DstReg),
                [](const MachineInstr &Use) { return Use.mayStore(); });
}

/// Change a G_FCONSTANT into a G_CONSTANT.
static void applyFConstantToConstant(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
  MachineIRBuilder MIB(MI);
  const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
  MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
  MI.eraseFromParent();
}

/// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
/// are sign bits. In this case, we can transform the G_ICMP to directly compare
/// the wide value with a zero.
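///
/// For example (an illustrative sketch; register names are placeholders, and
/// KnownBits must prove that the bits dropped by the G_TRUNC are sign bits):
///
///   %narrow:_(s32) = G_TRUNC %wide(s64)
///   %zero:_(s32) = G_CONSTANT i32 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %narrow(s32), %zero
///
/// can be rewritten to compare the wide value against a wide zero:
///
///   %zero64:_(s64) = G_CONSTANT i64 0
///   %cmp:_(s1) = G_ICMP intpred(eq), %wide(s64), %zero64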
static bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    GISelKnownBits *KB, Register &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);

  auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
  if (!ICmpInst::isEquality(Pred))
    return false;

  Register LHS = MI.getOperand(2).getReg();
  LLT LHSTy = MRI.getType(LHS);
  if (!LHSTy.isScalar())
    return false;

  Register RHS = MI.getOperand(3).getReg();
  Register WideReg;

  if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
      !mi_match(RHS, MRI, m_SpecificICst(0)))
    return false;

  LLT WideTy = MRI.getType(WideReg);
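  // The compare stays correct only if every bit dropped by the G_TRUNC is a
  // copy of the narrow value's sign bit, i.e. the wide value has strictly more
  // sign bits than there are truncated-away bits; then the wide value is zero
  // exactly when the truncated value is.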
  if (KB->computeNumSignBits(WideReg) <=
      WideTy.getSizeInBits() - LHSTy.getSizeInBits())
    return false;

  MatchInfo = WideReg;
  return true;
}

static bool applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
                                    MachineIRBuilder &Builder,
                                    GISelChangeObserver &Observer,
                                    Register &WideReg) {
  assert(MI.getOpcode() == TargetOpcode::G_ICMP);

  LLT WideTy = MRI.getType(WideReg);
  // We're going to directly use the wide register as the LHS, and then use an
  // equivalently-sized zero constant for the RHS.
  Builder.setInstrAndDebugLoc(MI);
  auto WideZero = Builder.buildConstant(WideTy, 0);
  Observer.changingInstr(MI);
  MI.getOperand(2).setReg(WideReg);
  MI.getOperand(3).setReg(WideZero.getReg(0));
  Observer.changedInstr(MI);
  return true;
}

/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
///
/// e.g.
///
/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  std::pair<uint64_t, uint64_t> &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
  MachineFunction &MF = *MI.getMF();
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  if (GV->isThreadLocal())
    return false;

  // Don't allow anything that could represent offsets etc.
  if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
          GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
    return false;

  // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Identify the *smallest* constant. We want to be able to form this:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  Register Dst = MI.getOperand(0).getReg();
  uint64_t MinOffset = -1ull;
  for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
    if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
      return false;
    auto Cst = getIConstantVRegValWithLookThrough(
        UseInstr.getOperand(2).getReg(), MRI);
    if (!Cst)
      return false;
    MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
  }

  // Require that the new offset is larger than the existing one to avoid
  // infinite loops.
  uint64_t CurrOffset = GlobalOp.getOffset();
  uint64_t NewOffset = MinOffset + CurrOffset;
  if (NewOffset <= CurrOffset)
    return false;

  // Check whether folding this offset is legal. It must not go out of bounds of
  // the referenced object to avoid violating the code model, and must be
  // smaller than 2^20 because this is the largest offset expressible in all
  // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
  // stores an immediate signed 21 bit offset.)
  //
  // This check also prevents us from folding negative offsets, which will end
  // up being treated in the same way as large positive ones. They could also
  // cause code model violations, and aren't really common enough to matter.
  if (NewOffset >= (1 << 20))
    return false;

  Type *T = GV->getValueType();
  if (!T->isSized() ||
      NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
    return false;
  MatchInfo = std::make_pair(NewOffset, MinOffset);
  return true;
}

static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  MachineIRBuilder &B,
                                  GISelChangeObserver &Observer,
                                  std::pair<uint64_t, uint64_t> &MatchInfo) {
  // Change:
  //
  //  %g = G_GLOBAL_VALUE @x
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  %ptr2 = G_PTR_ADD %g, cst2
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // To:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + min_cst
  //  %g = G_PTR_ADD %offset_g, -min_cst
  //  %ptr1 = G_PTR_ADD %g, cst1
  //  ...
  //  %ptrN = G_PTR_ADD %g, cstN
  //
  // Then, the original G_PTR_ADDs should be folded later on so that they look
  // like this:
  //
  //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
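  //
  // For example (an illustrative sketch; @x and the constants are
  // placeholders), if @x is addressed at offsets 8 and 16, then min_cst is 8
  // and we emit:
  //
  //  %offset_g = G_GLOBAL_VALUE @x + 8
  //  %g = G_PTR_ADD %offset_g, -8
  //  %ptr1 = G_PTR_ADD %g, 8     (later folds to %offset_g + 0)
  //  %ptr2 = G_PTR_ADD %g, 16    (later folds to %offset_g + 8)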
  uint64_t Offset, MinOffset;
  std::tie(Offset, MinOffset) = MatchInfo;
  B.setInstrAndDebugLoc(MI);
  Observer.changingInstr(MI);
  auto &GlobalOp = MI.getOperand(1);
  auto *GV = GlobalOp.getGlobal();
  GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
  Register Dst = MI.getOperand(0).getReg();
  Register NewGVDst = MRI.cloneVirtualRegister(Dst);
  MI.getOperand(0).setReg(NewGVDst);
  Observer.changedInstr(MI);
  B.buildPtrAdd(
      Dst, NewGVDst,
      B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
  return true;
}

static bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
                               CombinerHelper &Helper,
                               GISelChangeObserver &Observer) {
  // Try to simplify a G_UADDO with 8 or 16 bit operands into a wide G_ADD and
  // a TBNZ if the result is only used in the no-overflow case. This is
  // restricted to cases where we know that the high bits of the operands are
  // 0. If there's an overflow, then the 9th or 17th bit must be set, which can
  // be checked using TBNZ.
  //
  // Change (for UADDOs on 8 and 16 bits):
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %op0 = G_TRUNC %z0
  //   %z1 = G_ASSERT_ZEXT _
  //   %op1 = G_TRUNC %z1
  //   %val, %cond = G_UADDO %op0, %op1
  //   G_BRCOND %cond, %error.bb
  //
  // error.bb:
  //   (no successors and no uses of %val)
  //
  // To:
  //
  //   %z0 = G_ASSERT_ZEXT _
  //   %z1 = G_ASSERT_ZEXT _
  //   %add = G_ADD %z0, %z1
  //   %val = G_TRUNC %add
  //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
  //   %cond = G_ICMP NE, %bit, 0
  //   G_BRCOND %cond, %error.bb

  auto &MRI = *B.getMRI();

  MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
  MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
  Register Op0Wide;
  Register Op1Wide;
  if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
      !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
    return false;
  LLT WideTy0 = MRI.getType(Op0Wide);
  LLT WideTy1 = MRI.getType(Op1Wide);
  Register ResVal = MI.getOperand(0).getReg();
  LLT OpTy = MRI.getType(ResVal);
  MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
  MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);

  unsigned OpTySize = OpTy.getScalarSizeInBits();
  // First check that the G_TRUNCs feeding the G_UADDO are no-ops, because the
  // inputs have been zero-extended.
  if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
      OpTySize != Op0WideDef->getOperand(2).getImm() ||
      OpTySize != Op1WideDef->getOperand(2).getImm())
    return false;

  // Only scalar UADDOs with either 8 or 16 bit operands are handled.
  if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
      OpTySize >= WideTy0.getScalarSizeInBits() ||
      (OpTySize != 8 && OpTySize != 16))
    return false;

  // The overflow-status result must be used by a branch only.
  Register ResStatus = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(ResStatus))
    return false;
  MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
  if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
    return false;

  // Make sure the computed result is only used in the no-overflow blocks.
  MachineBasicBlock *CurrentMBB = MI.getParent();
  MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
  if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
    return false;
  if (any_of(MRI.use_nodbg_instructions(ResVal),
             [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
               return &MI != &I &&
                      (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
             }))
    return false;

  // Remove the G_UADDO.
  B.setInstrAndDebugLoc(*MI.getNextNode());
  MI.eraseFromParent();

  // Emit wide add.
  Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
  B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});

  // Emit check of the 9th or 17th bit and update users (the branch). This will
  // later be folded to TBNZ.
  Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
  B.buildAnd(
      CondBit, AddDst,
      B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
  B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
              B.buildConstant(LLT::scalar(32), 0));

  // Update ZExt users of the result value. Because all uses are in the
  // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
  B.buildZExtOrTrunc(ResVal, AddDst);
  for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
    Register WideReg;
    if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
      auto OldR = U.getParent()->getOperand(0).getReg();
      Observer.erasingInstr(*U.getParent());
      U.getParent()->eraseFromParent();
      Helper.replaceRegWith(MRI, OldR, AddDst);
    }
  }

  return true;
}

class AArch64PreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;

public:
  AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper)
      : Helper(Helper) {}
};

#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;
  AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

public:
  AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                       MachineIRBuilder &B) const override;
};

bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_UADDO:
    return tryToSimplifyUADDO(MI, B, Helper, Observer);
  case TargetOpcode::G_MEMCPY_INLINE:
    return Helper.tryEmitMemcpyInline(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    // If we're at -O0, set a maxlen of 32 to inline; otherwise pass 0 (no
    // fixed limit) and let the other heuristics decide.
    unsigned MaxLen = EnableOpt ? 0 : 32;
    // Try to inline memcpy type calls if optimizations are enabled.
    if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
      return true;
    if (Opc == TargetOpcode::G_MEMSET)
      return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
    return false;
  }
  }

  return false;
}

#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AArch64PreLegalizerCombiner();

  StringRef getPassName() const override {
    return "AArch64PreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
} // end anonymous namespace

void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  AU.addRequired<MachineDominatorTree>();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto &TPC = getAnalysis<TargetPassConfig>();

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());

  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
  AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), KB, MDT);
  Combiner C(PCInfo, &TPC);
  return C.combineMachineInstrs(MF, CSEInfo);
}

char AArch64PreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AArch64 machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AArch64 machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner() {
  return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm