xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp (revision d13def78ccef6dbc25c2e197089ee5fc4d7b82c3)
1 //===-- lib/CodeGen/GlobalISel/GICombinerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
9 #include "llvm/CodeGen/GlobalISel/Combiner.h"
10 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
11 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
13 #include "llvm/CodeGen/GlobalISel/Utils.h"
14 #include "llvm/CodeGen/MachineDominators.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/MachineInstr.h"
17 #include "llvm/CodeGen/MachineRegisterInfo.h"
18 #include "llvm/CodeGen/TargetInstrInfo.h"
19 #include "llvm/CodeGen/TargetLowering.h"
20 #include "llvm/Target/TargetMachine.h"
21 
22 #define DEBUG_TYPE "gi-combiner"
23 
24 using namespace llvm;
25 
26 // Option to allow testing of the combiner while no targets know about indexed
27 // addressing.
28 static cl::opt<bool>
29     ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
30                        cl::desc("Force all indexed operations to be "
31                                 "legal for the GlobalISel combiner"));
32 
33 
34 CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
35                                MachineIRBuilder &B, GISelKnownBits *KB,
36                                MachineDominatorTree *MDT)
37     : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
38       KB(KB), MDT(MDT) {
39   (void)this->KB;
40 }
41 
42 void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
43                                     Register ToReg) const {
44   Observer.changingAllUsesOfReg(MRI, FromReg);
45 
46   if (MRI.constrainRegAttrs(ToReg, FromReg))
47     MRI.replaceRegWith(FromReg, ToReg);
48   else
49     Builder.buildCopy(ToReg, FromReg);
50 
51   Observer.finishedChangingAllUsesOfReg();
52 }
53 
54 void CombinerHelper::replaceRegOpWith(MachineRegisterInfo &MRI,
55                                       MachineOperand &FromRegOp,
56                                       Register ToReg) const {
57   assert(FromRegOp.getParent() && "Expected an operand in an MI");
58   Observer.changingInstr(*FromRegOp.getParent());
59 
60   FromRegOp.setReg(ToReg);
61 
62   Observer.changedInstr(*FromRegOp.getParent());
63 }
64 
65 bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
66   if (matchCombineCopy(MI)) {
67     applyCombineCopy(MI);
68     return true;
69   }
70   return false;
71 }
72 bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
73   if (MI.getOpcode() != TargetOpcode::COPY)
74     return false;
75   Register DstReg = MI.getOperand(0).getReg();
76   Register SrcReg = MI.getOperand(1).getReg();
77 
78   // Give up if either DstReg or SrcReg  is a physical register.
79   if (Register::isPhysicalRegister(DstReg) ||
80       Register::isPhysicalRegister(SrcReg))
81     return false;
82 
83   // Give up the types don't match.
84   LLT DstTy = MRI.getType(DstReg);
85   LLT SrcTy = MRI.getType(SrcReg);
86   // Give up if one has a valid LLT, but the other doesn't.
87   if (DstTy.isValid() != SrcTy.isValid())
88     return false;
89   // Give up if the types don't match.
90   if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
91     return false;
92 
93   // Get the register banks and classes.
94   const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
95   const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
96   const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
97   const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
98 
99   // Replace if the register constraints match.
100   if ((SrcRC == DstRC) && (SrcBank == DstBank))
101     return true;
102   // Replace if DstReg has no constraints.
103   if (!DstBank && !DstRC)
104     return true;
105 
106   return false;
107 }
108 void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
109   Register DstReg = MI.getOperand(0).getReg();
110   Register SrcReg = MI.getOperand(1).getReg();
111   MI.eraseFromParent();
112   replaceRegWith(MRI, DstReg, SrcReg);
113 }
114 
115 bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
116   bool IsUndef = false;
117   SmallVector<Register, 4> Ops;
118   if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
119     applyCombineConcatVectors(MI, IsUndef, Ops);
120     return true;
121   }
122   return false;
123 }
124 
125 bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
126                                                SmallVectorImpl<Register> &Ops) {
127   assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
128          "Invalid instruction");
129   IsUndef = true;
130   MachineInstr *Undef = nullptr;
131 
132   // Walk over all the operands of concat vectors and check if they are
133   // build_vector themselves or undef.
134   // Then collect their operands in Ops.
135   for (const MachineOperand &MO : MI.uses()) {
136     Register Reg = MO.getReg();
137     MachineInstr *Def = MRI.getVRegDef(Reg);
138     assert(Def && "Operand not defined");
139     switch (Def->getOpcode()) {
140     case TargetOpcode::G_BUILD_VECTOR:
141       IsUndef = false;
142       // Remember the operands of the build_vector to fold
143       // them into the yet-to-build flattened concat vectors.
144       for (const MachineOperand &BuildVecMO : Def->uses())
145         Ops.push_back(BuildVecMO.getReg());
146       break;
147     case TargetOpcode::G_IMPLICIT_DEF: {
148       LLT OpType = MRI.getType(Reg);
149       // Keep one undef value for all the undef operands.
150       if (!Undef) {
151         Builder.setInsertPt(*MI.getParent(), MI);
152         Undef = Builder.buildUndef(OpType.getScalarType());
153       }
154       assert(MRI.getType(Undef->getOperand(0).getReg()) ==
155                  OpType.getScalarType() &&
156              "All undefs should have the same type");
157       // Break the undef vector in as many scalar elements as needed
158       // for the flattening.
159       for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
160            EltIdx != EltEnd; ++EltIdx)
161         Ops.push_back(Undef->getOperand(0).getReg());
162       break;
163     }
164     default:
165       return false;
166     }
167   }
168   return true;
169 }
170 void CombinerHelper::applyCombineConcatVectors(
171     MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
172   // We determined that the concat_vectors can be flatten.
173   // Generate the flattened build_vector.
174   Register DstReg = MI.getOperand(0).getReg();
175   Builder.setInsertPt(*MI.getParent(), MI);
176   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
177 
178   // Note: IsUndef is sort of redundant. We could have determine it by
179   // checking that at all Ops are undef.  Alternatively, we could have
180   // generate a build_vector of undefs and rely on another combine to
181   // clean that up.  For now, given we already gather this information
182   // in tryCombineConcatVectors, just save compile time and issue the
183   // right thing.
184   if (IsUndef)
185     Builder.buildUndef(NewDstReg);
186   else
187     Builder.buildBuildVector(NewDstReg, Ops);
188   MI.eraseFromParent();
189   replaceRegWith(MRI, DstReg, NewDstReg);
190 }
191 
192 bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
193   SmallVector<Register, 4> Ops;
194   if (matchCombineShuffleVector(MI, Ops)) {
195     applyCombineShuffleVector(MI, Ops);
196     return true;
197   }
198   return false;
199 }
200 
201 bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
202                                                SmallVectorImpl<Register> &Ops) {
203   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
204          "Invalid instruction kind");
205   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
206   Register Src1 = MI.getOperand(1).getReg();
207   LLT SrcType = MRI.getType(Src1);
208   // As bizarre as it may look, shuffle vector can actually produce
209   // scalar! This is because at the IR level a <1 x ty> shuffle
210   // vector is perfectly valid.
211   unsigned DstNumElts = DstType.isVector() ? DstType.getNumElements() : 1;
212   unsigned SrcNumElts = SrcType.isVector() ? SrcType.getNumElements() : 1;
213 
214   // If the resulting vector is smaller than the size of the source
215   // vectors being concatenated, we won't be able to replace the
216   // shuffle vector into a concat_vectors.
217   //
218   // Note: We may still be able to produce a concat_vectors fed by
219   //       extract_vector_elt and so on. It is less clear that would
220   //       be better though, so don't bother for now.
221   //
222   // If the destination is a scalar, the size of the sources doesn't
223   // matter. we will lower the shuffle to a plain copy. This will
224   // work only if the source and destination have the same size. But
225   // that's covered by the next condition.
226   //
227   // TODO: If the size between the source and destination don't match
228   //       we could still emit an extract vector element in that case.
229   if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
230     return false;
231 
232   // Check that the shuffle mask can be broken evenly between the
233   // different sources.
234   if (DstNumElts % SrcNumElts != 0)
235     return false;
236 
237   // Mask length is a multiple of the source vector length.
238   // Check if the shuffle is some kind of concatenation of the input
239   // vectors.
240   unsigned NumConcat = DstNumElts / SrcNumElts;
241   SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
242   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
243   for (unsigned i = 0; i != DstNumElts; ++i) {
244     int Idx = Mask[i];
245     // Undef value.
246     if (Idx < 0)
247       continue;
248     // Ensure the indices in each SrcType sized piece are sequential and that
249     // the same source is used for the whole piece.
250     if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
251         (ConcatSrcs[i / SrcNumElts] >= 0 &&
252          ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
253       return false;
254     // Remember which source this index came from.
255     ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
256   }
257 
258   // The shuffle is concatenating multiple vectors together.
259   // Collect the different operands for that.
260   Register UndefReg;
261   Register Src2 = MI.getOperand(2).getReg();
262   for (auto Src : ConcatSrcs) {
263     if (Src < 0) {
264       if (!UndefReg) {
265         Builder.setInsertPt(*MI.getParent(), MI);
266         UndefReg = Builder.buildUndef(SrcType).getReg(0);
267       }
268       Ops.push_back(UndefReg);
269     } else if (Src == 0)
270       Ops.push_back(Src1);
271     else
272       Ops.push_back(Src2);
273   }
274   return true;
275 }
276 
277 void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
278                                                const ArrayRef<Register> Ops) {
279   Register DstReg = MI.getOperand(0).getReg();
280   Builder.setInsertPt(*MI.getParent(), MI);
281   Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
282 
283   if (Ops.size() == 1)
284     Builder.buildCopy(NewDstReg, Ops[0]);
285   else
286     Builder.buildMerge(NewDstReg, Ops);
287 
288   MI.eraseFromParent();
289   replaceRegWith(MRI, DstReg, NewDstReg);
290 }
291 
292 namespace {
293 
294 /// Select a preference between two uses. CurrentUse is the current preference
295 /// while *ForCandidate is attributes of the candidate under consideration.
296 PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
297                                   const LLT &TyForCandidate,
298                                   unsigned OpcodeForCandidate,
299                                   MachineInstr *MIForCandidate) {
300   if (!CurrentUse.Ty.isValid()) {
301     if (CurrentUse.ExtendOpcode == OpcodeForCandidate ||
302         CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT)
303       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
304     return CurrentUse;
305   }
306 
307   // We permit the extend to hoist through basic blocks but this is only
308   // sensible if the target has extending loads. If you end up lowering back
309   // into a load and extend during the legalizer then the end result is
310   // hoisting the extend up to the load.
311 
312   // Prefer defined extensions to undefined extensions as these are more
313   // likely to reduce the number of instructions.
314   if (OpcodeForCandidate == TargetOpcode::G_ANYEXT &&
315       CurrentUse.ExtendOpcode != TargetOpcode::G_ANYEXT)
316     return CurrentUse;
317   else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ANYEXT &&
318            OpcodeForCandidate != TargetOpcode::G_ANYEXT)
319     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
320 
321   // Prefer sign extensions to zero extensions as sign-extensions tend to be
322   // more expensive.
323   if (CurrentUse.Ty == TyForCandidate) {
324     if (CurrentUse.ExtendOpcode == TargetOpcode::G_SEXT &&
325         OpcodeForCandidate == TargetOpcode::G_ZEXT)
326       return CurrentUse;
327     else if (CurrentUse.ExtendOpcode == TargetOpcode::G_ZEXT &&
328              OpcodeForCandidate == TargetOpcode::G_SEXT)
329       return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
330   }
331 
332   // This is potentially target specific. We've chosen the largest type
333   // because G_TRUNC is usually free. One potential catch with this is that
334   // some targets have a reduced number of larger registers than smaller
335   // registers and this choice potentially increases the live-range for the
336   // larger value.
337   if (TyForCandidate.getSizeInBits() > CurrentUse.Ty.getSizeInBits()) {
338     return {TyForCandidate, OpcodeForCandidate, MIForCandidate};
339   }
340   return CurrentUse;
341 }
342 
343 /// Find a suitable place to insert some instructions and insert them. This
344 /// function accounts for special cases like inserting before a PHI node.
345 /// The current strategy for inserting before PHI's is to duplicate the
346 /// instructions for each predecessor. However, while that's ok for G_TRUNC
347 /// on most targets since it generally requires no code, other targets/cases may
348 /// want to try harder to find a dominating block.
349 static void InsertInsnsWithoutSideEffectsBeforeUse(
350     MachineIRBuilder &Builder, MachineInstr &DefMI, MachineOperand &UseMO,
351     std::function<void(MachineBasicBlock *, MachineBasicBlock::iterator,
352                        MachineOperand &UseMO)>
353         Inserter) {
354   MachineInstr &UseMI = *UseMO.getParent();
355 
356   MachineBasicBlock *InsertBB = UseMI.getParent();
357 
358   // If the use is a PHI then we want the predecessor block instead.
359   if (UseMI.isPHI()) {
360     MachineOperand *PredBB = std::next(&UseMO);
361     InsertBB = PredBB->getMBB();
362   }
363 
364   // If the block is the same block as the def then we want to insert just after
365   // the def instead of at the start of the block.
366   if (InsertBB == DefMI.getParent()) {
367     MachineBasicBlock::iterator InsertPt = &DefMI;
368     Inserter(InsertBB, std::next(InsertPt), UseMO);
369     return;
370   }
371 
372   // Otherwise we want the start of the BB
373   Inserter(InsertBB, InsertBB->getFirstNonPHI(), UseMO);
374 }
375 } // end anonymous namespace
376 
377 bool CombinerHelper::tryCombineExtendingLoads(MachineInstr &MI) {
378   PreferredTuple Preferred;
379   if (matchCombineExtendingLoads(MI, Preferred)) {
380     applyCombineExtendingLoads(MI, Preferred);
381     return true;
382   }
383   return false;
384 }
385 
386 bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
387                                                 PreferredTuple &Preferred) {
388   // We match the loads and follow the uses to the extend instead of matching
389   // the extends and following the def to the load. This is because the load
390   // must remain in the same position for correctness (unless we also add code
391   // to find a safe place to sink it) whereas the extend is freely movable.
392   // It also prevents us from duplicating the load for the volatile case or just
393   // for performance.
394 
395   if (MI.getOpcode() != TargetOpcode::G_LOAD &&
396       MI.getOpcode() != TargetOpcode::G_SEXTLOAD &&
397       MI.getOpcode() != TargetOpcode::G_ZEXTLOAD)
398     return false;
399 
400   auto &LoadValue = MI.getOperand(0);
401   assert(LoadValue.isReg() && "Result wasn't a register?");
402 
403   LLT LoadValueTy = MRI.getType(LoadValue.getReg());
404   if (!LoadValueTy.isScalar())
405     return false;
406 
407   // Most architectures are going to legalize <s8 loads into at least a 1 byte
408   // load, and the MMOs can only describe memory accesses in multiples of bytes.
409   // If we try to perform extload combining on those, we can end up with
410   // %a(s8) = extload %ptr (load 1 byte from %ptr)
411   // ... which is an illegal extload instruction.
412   if (LoadValueTy.getSizeInBits() < 8)
413     return false;
414 
415   // For non power-of-2 types, they will very likely be legalized into multiple
416   // loads. Don't bother trying to match them into extending loads.
417   if (!isPowerOf2_32(LoadValueTy.getSizeInBits()))
418     return false;
419 
420   // Find the preferred type aside from the any-extends (unless it's the only
421   // one) and non-extending ops. We'll emit an extending load to that type and
422   // and emit a variant of (extend (trunc X)) for the others according to the
423   // relative type sizes. At the same time, pick an extend to use based on the
424   // extend involved in the chosen type.
425   unsigned PreferredOpcode = MI.getOpcode() == TargetOpcode::G_LOAD
426                                  ? TargetOpcode::G_ANYEXT
427                                  : MI.getOpcode() == TargetOpcode::G_SEXTLOAD
428                                        ? TargetOpcode::G_SEXT
429                                        : TargetOpcode::G_ZEXT;
430   Preferred = {LLT(), PreferredOpcode, nullptr};
431   for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
432     if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
433         UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
434         UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
435       Preferred = ChoosePreferredUse(Preferred,
436                                      MRI.getType(UseMI.getOperand(0).getReg()),
437                                      UseMI.getOpcode(), &UseMI);
438     }
439   }
440 
441   // There were no extends
442   if (!Preferred.MI)
443     return false;
444   // It should be impossible to chose an extend without selecting a different
445   // type since by definition the result of an extend is larger.
446   assert(Preferred.Ty != LoadValueTy && "Extending to same type?");
447 
448   LLVM_DEBUG(dbgs() << "Preferred use is: " << *Preferred.MI);
449   return true;
450 }
451 
452 void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
453                                                 PreferredTuple &Preferred) {
454   // Rewrite the load to the chosen extending load.
455   Register ChosenDstReg = Preferred.MI->getOperand(0).getReg();
456 
457   // Inserter to insert a truncate back to the original type at a given point
458   // with some basic CSE to limit truncate duplication to one per BB.
459   DenseMap<MachineBasicBlock *, MachineInstr *> EmittedInsns;
460   auto InsertTruncAt = [&](MachineBasicBlock *InsertIntoBB,
461                            MachineBasicBlock::iterator InsertBefore,
462                            MachineOperand &UseMO) {
463     MachineInstr *PreviouslyEmitted = EmittedInsns.lookup(InsertIntoBB);
464     if (PreviouslyEmitted) {
465       Observer.changingInstr(*UseMO.getParent());
466       UseMO.setReg(PreviouslyEmitted->getOperand(0).getReg());
467       Observer.changedInstr(*UseMO.getParent());
468       return;
469     }
470 
471     Builder.setInsertPt(*InsertIntoBB, InsertBefore);
472     Register NewDstReg = MRI.cloneVirtualRegister(MI.getOperand(0).getReg());
473     MachineInstr *NewMI = Builder.buildTrunc(NewDstReg, ChosenDstReg);
474     EmittedInsns[InsertIntoBB] = NewMI;
475     replaceRegOpWith(MRI, UseMO, NewDstReg);
476   };
477 
478   Observer.changingInstr(MI);
479   MI.setDesc(
480       Builder.getTII().get(Preferred.ExtendOpcode == TargetOpcode::G_SEXT
481                                ? TargetOpcode::G_SEXTLOAD
482                                : Preferred.ExtendOpcode == TargetOpcode::G_ZEXT
483                                      ? TargetOpcode::G_ZEXTLOAD
484                                      : TargetOpcode::G_LOAD));
485 
486   // Rewrite all the uses to fix up the types.
487   auto &LoadValue = MI.getOperand(0);
488   SmallVector<MachineOperand *, 4> Uses;
489   for (auto &UseMO : MRI.use_operands(LoadValue.getReg()))
490     Uses.push_back(&UseMO);
491 
492   for (auto *UseMO : Uses) {
493     MachineInstr *UseMI = UseMO->getParent();
494 
495     // If the extend is compatible with the preferred extend then we should fix
496     // up the type and extend so that it uses the preferred use.
497     if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
498         UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
499       Register UseDstReg = UseMI->getOperand(0).getReg();
500       MachineOperand &UseSrcMO = UseMI->getOperand(1);
501       const LLT &UseDstTy = MRI.getType(UseDstReg);
502       if (UseDstReg != ChosenDstReg) {
503         if (Preferred.Ty == UseDstTy) {
504           // If the use has the same type as the preferred use, then merge
505           // the vregs and erase the extend. For example:
506           //    %1:_(s8) = G_LOAD ...
507           //    %2:_(s32) = G_SEXT %1(s8)
508           //    %3:_(s32) = G_ANYEXT %1(s8)
509           //    ... = ... %3(s32)
510           // rewrites to:
511           //    %2:_(s32) = G_SEXTLOAD ...
512           //    ... = ... %2(s32)
513           replaceRegWith(MRI, UseDstReg, ChosenDstReg);
514           Observer.erasingInstr(*UseMO->getParent());
515           UseMO->getParent()->eraseFromParent();
516         } else if (Preferred.Ty.getSizeInBits() < UseDstTy.getSizeInBits()) {
517           // If the preferred size is smaller, then keep the extend but extend
518           // from the result of the extending load. For example:
519           //    %1:_(s8) = G_LOAD ...
520           //    %2:_(s32) = G_SEXT %1(s8)
521           //    %3:_(s64) = G_ANYEXT %1(s8)
522           //    ... = ... %3(s64)
523           /// rewrites to:
524           //    %2:_(s32) = G_SEXTLOAD ...
525           //    %3:_(s64) = G_ANYEXT %2:_(s32)
526           //    ... = ... %3(s64)
527           replaceRegOpWith(MRI, UseSrcMO, ChosenDstReg);
528         } else {
529           // If the preferred size is large, then insert a truncate. For
530           // example:
531           //    %1:_(s8) = G_LOAD ...
532           //    %2:_(s64) = G_SEXT %1(s8)
533           //    %3:_(s32) = G_ZEXT %1(s8)
534           //    ... = ... %3(s32)
535           /// rewrites to:
536           //    %2:_(s64) = G_SEXTLOAD ...
537           //    %4:_(s8) = G_TRUNC %2:_(s32)
538           //    %3:_(s64) = G_ZEXT %2:_(s8)
539           //    ... = ... %3(s64)
540           InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO,
541                                                  InsertTruncAt);
542         }
543         continue;
544       }
545       // The use is (one of) the uses of the preferred use we chose earlier.
546       // We're going to update the load to def this value later so just erase
547       // the old extend.
548       Observer.erasingInstr(*UseMO->getParent());
549       UseMO->getParent()->eraseFromParent();
550       continue;
551     }
552 
553     // The use isn't an extend. Truncate back to the type we originally loaded.
554     // This is free on many targets.
555     InsertInsnsWithoutSideEffectsBeforeUse(Builder, MI, *UseMO, InsertTruncAt);
556   }
557 
558   MI.getOperand(0).setReg(ChosenDstReg);
559   Observer.changedInstr(MI);
560 }
561 
562 bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
563   assert(DefMI.getParent() == UseMI.getParent());
564   if (&DefMI == &UseMI)
565     return false;
566 
567   // Loop through the basic block until we find one of the instructions.
568   MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
569   for (; &*I != &DefMI && &*I != &UseMI; ++I)
570     return &*I == &DefMI;
571 
572   llvm_unreachable("Block must contain instructions");
573 }
574 
575 bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
576   if (MDT)
577     return MDT->dominates(&DefMI, &UseMI);
578   else if (DefMI.getParent() != UseMI.getParent())
579     return false;
580 
581   return isPredecessor(DefMI, UseMI);
582 }
583 
584 bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
585                                             Register &Base, Register &Offset) {
586   auto &MF = *MI.getParent()->getParent();
587   const auto &TLI = *MF.getSubtarget().getTargetLowering();
588 
589 #ifndef NDEBUG
590   unsigned Opcode = MI.getOpcode();
591   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
592          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
593 #endif
594 
595   Base = MI.getOperand(1).getReg();
596   MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
597   if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
598     return false;
599 
600   LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
601 
602   for (auto &Use : MRI.use_instructions(Base)) {
603     if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
604       continue;
605 
606     Offset = Use.getOperand(2).getReg();
607     if (!ForceLegalIndexing &&
608         !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
609       LLVM_DEBUG(dbgs() << "    Ignoring candidate with illegal addrmode: "
610                         << Use);
611       continue;
612     }
613 
614     // Make sure the offset calculation is before the potentially indexed op.
615     // FIXME: we really care about dependency here. The offset calculation might
616     // be movable.
617     MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
618     if (!OffsetDef || !dominates(*OffsetDef, MI)) {
619       LLVM_DEBUG(dbgs() << "    Ignoring candidate with offset after mem-op: "
620                         << Use);
621       continue;
622     }
623 
624     // FIXME: check whether all uses of Base are load/store with foldable
625     // addressing modes. If so, using the normal addr-modes is better than
626     // forming an indexed one.
627 
628     bool MemOpDominatesAddrUses = true;
629     for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
630       if (!dominates(MI, PtrAddUse)) {
631         MemOpDominatesAddrUses = false;
632         break;
633       }
634     }
635 
636     if (!MemOpDominatesAddrUses) {
637       LLVM_DEBUG(
638           dbgs() << "    Ignoring candidate as memop does not dominate uses: "
639                  << Use);
640       continue;
641     }
642 
643     LLVM_DEBUG(dbgs() << "    Found match: " << Use);
644     Addr = Use.getOperand(0).getReg();
645     return true;
646   }
647 
648   return false;
649 }
650 
651 bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
652                                            Register &Base, Register &Offset) {
653   auto &MF = *MI.getParent()->getParent();
654   const auto &TLI = *MF.getSubtarget().getTargetLowering();
655 
656 #ifndef NDEBUG
657   unsigned Opcode = MI.getOpcode();
658   assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
659          Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
660 #endif
661 
662   Addr = MI.getOperand(1).getReg();
663   MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
664   if (!AddrDef || MRI.hasOneUse(Addr))
665     return false;
666 
667   Base = AddrDef->getOperand(1).getReg();
668   Offset = AddrDef->getOperand(2).getReg();
669 
670   LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
671 
672   if (!ForceLegalIndexing &&
673       !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
674     LLVM_DEBUG(dbgs() << "    Skipping, not legal for target");
675     return false;
676   }
677 
678   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
679   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
680     LLVM_DEBUG(dbgs() << "    Skipping, frame index would need copy anyway.");
681     return false;
682   }
683 
684   if (MI.getOpcode() == TargetOpcode::G_STORE) {
685     // Would require a copy.
686     if (Base == MI.getOperand(0).getReg()) {
687       LLVM_DEBUG(dbgs() << "    Skipping, storing base so need copy anyway.");
688       return false;
689     }
690 
691     // We're expecting one use of Addr in MI, but it could also be the
692     // value stored, which isn't actually dominated by the instruction.
693     if (MI.getOperand(0).getReg() == Addr) {
694       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses");
695       return false;
696     }
697   }
698 
699   // FIXME: check whether all uses of the base pointer are constant PtrAdds.
700   // That might allow us to end base's liveness here by adjusting the constant.
701 
702   for (auto &UseMI : MRI.use_instructions(Addr)) {
703     if (!dominates(MI, UseMI)) {
704       LLVM_DEBUG(dbgs() << "    Skipping, does not dominate all addr uses.");
705       return false;
706     }
707   }
708 
709   return true;
710 }
711 
712 bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
713   IndexedLoadStoreMatchInfo MatchInfo;
714   if (matchCombineIndexedLoadStore(MI, MatchInfo)) {
715     applyCombineIndexedLoadStore(MI, MatchInfo);
716     return true;
717   }
718   return false;
719 }
720 
721 bool CombinerHelper::matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
722   unsigned Opcode = MI.getOpcode();
723   if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
724       Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
725     return false;
726 
727   MatchInfo.IsPre = findPreIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
728                                           MatchInfo.Offset);
729   if (!MatchInfo.IsPre &&
730       !findPostIndexCandidate(MI, MatchInfo.Addr, MatchInfo.Base,
731                               MatchInfo.Offset))
732     return false;
733 
734   return true;
735 }
736 
737 void CombinerHelper::applyCombineIndexedLoadStore(
738     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
739   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
740   MachineIRBuilder MIRBuilder(MI);
741   unsigned Opcode = MI.getOpcode();
742   bool IsStore = Opcode == TargetOpcode::G_STORE;
743   unsigned NewOpcode;
744   switch (Opcode) {
745   case TargetOpcode::G_LOAD:
746     NewOpcode = TargetOpcode::G_INDEXED_LOAD;
747     break;
748   case TargetOpcode::G_SEXTLOAD:
749     NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
750     break;
751   case TargetOpcode::G_ZEXTLOAD:
752     NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
753     break;
754   case TargetOpcode::G_STORE:
755     NewOpcode = TargetOpcode::G_INDEXED_STORE;
756     break;
757   default:
758     llvm_unreachable("Unknown load/store opcode");
759   }
760 
761   auto MIB = MIRBuilder.buildInstr(NewOpcode);
762   if (IsStore) {
763     MIB.addDef(MatchInfo.Addr);
764     MIB.addUse(MI.getOperand(0).getReg());
765   } else {
766     MIB.addDef(MI.getOperand(0).getReg());
767     MIB.addDef(MatchInfo.Addr);
768   }
769 
770   MIB.addUse(MatchInfo.Base);
771   MIB.addUse(MatchInfo.Offset);
772   MIB.addImm(MatchInfo.IsPre);
773   MI.eraseFromParent();
774   AddrDef.eraseFromParent();
775 
776   LLVM_DEBUG(dbgs() << "    Combinined to indexed operation");
777 }
778 
779 bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
780   if (MI.getOpcode() != TargetOpcode::G_BR)
781     return false;
782 
783   // Try to match the following:
784   // bb1:
785   //   %c(s32) = G_ICMP pred, %a, %b
786   //   %c1(s1) = G_TRUNC %c(s32)
787   //   G_BRCOND %c1, %bb2
788   //   G_BR %bb3
789   // bb2:
790   // ...
791   // bb3:
792 
793   // The above pattern does not have a fall through to the successor bb2, always
794   // resulting in a branch no matter which path is taken. Here we try to find
795   // and replace that pattern with conditional branch to bb3 and otherwise
796   // fallthrough to bb2.
797 
798   MachineBasicBlock *MBB = MI.getParent();
799   MachineBasicBlock::iterator BrIt(MI);
800   if (BrIt == MBB->begin())
801     return false;
802   assert(std::next(BrIt) == MBB->end() && "expected G_BR to be a terminator");
803 
804   MachineInstr *BrCond = &*std::prev(BrIt);
805   if (BrCond->getOpcode() != TargetOpcode::G_BRCOND)
806     return false;
807 
808   // Check that the next block is the conditional branch target.
809   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
810     return false;
811 
812   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
813   if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
814       !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
815     return false;
816   return true;
817 }
818 
819 bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
820   if (!matchElideBrByInvertingCond(MI))
821     return false;
822   applyElideBrByInvertingCond(MI);
823   return true;
824 }
825 
826 void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
827   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
828   MachineBasicBlock::iterator BrIt(MI);
829   MachineInstr *BrCond = &*std::prev(BrIt);
830   MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
831 
832   CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
833       (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
834 
835   // Invert the G_ICMP condition.
836   Observer.changingInstr(*CmpMI);
837   CmpMI->getOperand(1).setPredicate(InversePred);
838   Observer.changedInstr(*CmpMI);
839 
840   // Change the conditional branch target.
841   Observer.changingInstr(*BrCond);
842   BrCond->getOperand(1).setMBB(BrTarget);
843   Observer.changedInstr(*BrCond);
844   MI.eraseFromParent();
845 }
846 
847 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
848   // On Darwin, -Os means optimize for size without hurting performance, so
849   // only really optimize for size when -Oz (MinSize) is used.
850   if (MF.getTarget().getTargetTriple().isOSDarwin())
851     return MF.getFunction().hasMinSize();
852   return MF.getFunction().hasOptSize();
853 }
854 
855 // Returns a list of types to use for memory op lowering in MemOps. A partial
856 // port of findOptimalMemOpLowering in TargetLowering.
857 static bool findGISelOptimalMemOpLowering(
858     std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
859     unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
860     bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
861     const AttributeList &FuncAttributes, const TargetLowering &TLI) {
862   // If 'SrcAlign' is zero, that means the memory operation does not need to
863   // load the value, i.e. memset or memcpy from constant string. Otherwise,
864   // it's the inferred alignment of the source. 'DstAlign', on the other hand,
865   // is the specified alignment of the memory operation. If it is zero, that
866   // means it's possible to change the alignment of the destination.
867   // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
868   // not need to be loaded.
869   if (SrcAlign != 0 && SrcAlign < DstAlign)
870     return false;
871 
872   LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
873                                   ZeroMemset, MemcpyStrSrc, FuncAttributes);
874 
875   if (Ty == LLT()) {
876     // Use the largest scalar type whose alignment constraints are satisfied.
877     // We only need to check DstAlign here as SrcAlign is always greater or
878     // equal to DstAlign (or zero).
879     Ty = LLT::scalar(64);
880     while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
881            !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
882       Ty = LLT::scalar(Ty.getSizeInBytes());
883     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
884     // FIXME: check for the largest legal type we can load/store to.
885   }
886 
887   unsigned NumMemOps = 0;
888   while (Size != 0) {
889     unsigned TySize = Ty.getSizeInBytes();
890     while (TySize > Size) {
891       // For now, only use non-vector load / store's for the left-over pieces.
892       LLT NewTy = Ty;
893       // FIXME: check for mem op safety and legality of the types. Not all of
894       // SDAGisms map cleanly to GISel concepts.
895       if (NewTy.isVector())
896         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
897       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
898       unsigned NewTySize = NewTy.getSizeInBytes();
899       assert(NewTySize > 0 && "Could not find appropriate type");
900 
901       // If the new LLT cannot cover all of the remaining bits, then consider
902       // issuing a (or a pair of) unaligned and overlapping load / store.
903       bool Fast;
904       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
905       MVT VT = getMVTForLLT(Ty);
906       if (NumMemOps && AllowOverlap && NewTySize < Size &&
907           TLI.allowsMisalignedMemoryAccesses(
908               VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
909           Fast)
910         TySize = Size;
911       else {
912         Ty = NewTy;
913         TySize = NewTySize;
914       }
915     }
916 
917     if (++NumMemOps > Limit)
918       return false;
919 
920     MemOps.push_back(Ty);
921     Size -= TySize;
922   }
923 
924   return true;
925 }
926 
927 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
928   if (Ty.isVector())
929     return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
930                            Ty.getNumElements());
931   return IntegerType::get(C, Ty.getSizeInBits());
932 }
933 
934 // Get a vectorized representation of the memset value operand, GISel edition.
935 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
936   MachineRegisterInfo &MRI = *MIB.getMRI();
937   unsigned NumBits = Ty.getScalarSizeInBits();
938   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
939   if (!Ty.isVector() && ValVRegAndVal) {
940     unsigned KnownVal = ValVRegAndVal->Value;
941     APInt Scalar = APInt(8, KnownVal);
942     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
943     return MIB.buildConstant(Ty, SplatVal).getReg(0);
944   }
945   // FIXME: for vector types create a G_BUILD_VECTOR.
946   if (Ty.isVector())
947     return Register();
948 
949   // Extend the byte value to the larger type, and then multiply by a magic
950   // value 0x010101... in order to replicate it across every byte.
951   LLT ExtType = Ty.getScalarType();
952   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
953   if (NumBits > 8) {
954     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
955     auto MagicMI = MIB.buildConstant(ExtType, Magic);
956     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
957   }
958 
959   assert(ExtType == Ty && "Vector memset value type not supported yet");
960   return Val;
961 }
962 
963 bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val,
964                                     unsigned KnownLen, unsigned Align,
965                                     bool IsVolatile) {
966   auto &MF = *MI.getParent()->getParent();
967   const auto &TLI = *MF.getSubtarget().getTargetLowering();
968   auto &DL = MF.getDataLayout();
969   LLVMContext &C = MF.getFunction().getContext();
970 
971   assert(KnownLen != 0 && "Have a zero length memset length!");
972 
973   bool DstAlignCanChange = false;
974   MachineFrameInfo &MFI = MF.getFrameInfo();
975   bool OptSize = shouldLowerMemFuncForSize(MF);
976 
977   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
978   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
979     DstAlignCanChange = true;
980 
981   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
982   std::vector<LLT> MemOps;
983 
984   const auto &DstMMO = **MI.memoperands_begin();
985   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
986 
987   auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
988   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
989 
990   if (!findGISelOptimalMemOpLowering(
991           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
992           /*IsMemset=*/true,
993           /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
994           /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
995           MF.getFunction().getAttributes(), TLI))
996     return false;
997 
998   if (DstAlignCanChange) {
999     // Get an estimate of the type from the LLT.
1000     Type *IRTy = getTypeForLLT(MemOps[0], C);
1001     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1002     if (NewAlign > Align) {
1003       Align = NewAlign;
1004       unsigned FI = FIDef->getOperand(1).getIndex();
1005       // Give the stack frame object a larger alignment if needed.
1006       if (MFI.getObjectAlignment(FI) < Align)
1007         MFI.setObjectAlignment(FI, Align);
1008     }
1009   }
1010 
1011   MachineIRBuilder MIB(MI);
1012   // Find the largest store and generate the bit pattern for it.
1013   LLT LargestTy = MemOps[0];
1014   for (unsigned i = 1; i < MemOps.size(); i++)
1015     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
1016       LargestTy = MemOps[i];
1017 
1018   // The memset stored value is always defined as an s8, so in order to make it
1019   // work with larger store types we need to repeat the bit pattern across the
1020   // wider type.
1021   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
1022 
1023   if (!MemSetValue)
1024     return false;
1025 
1026   // Generate the stores. For each store type in the list, we generate the
1027   // matching store of that type to the destination address.
1028   LLT PtrTy = MRI.getType(Dst);
1029   unsigned DstOff = 0;
1030   unsigned Size = KnownLen;
1031   for (unsigned I = 0; I < MemOps.size(); I++) {
1032     LLT Ty = MemOps[I];
1033     unsigned TySize = Ty.getSizeInBytes();
1034     if (TySize > Size) {
1035       // Issuing an unaligned load / store pair that overlaps with the previous
1036       // pair. Adjust the offset accordingly.
1037       assert(I == MemOps.size() - 1 && I != 0);
1038       DstOff -= TySize - Size;
1039     }
1040 
1041     // If this store is smaller than the largest store see whether we can get
1042     // the smaller value for free with a truncate.
1043     Register Value = MemSetValue;
1044     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
1045       MVT VT = getMVTForLLT(Ty);
1046       MVT LargestVT = getMVTForLLT(LargestTy);
1047       if (!LargestTy.isVector() && !Ty.isVector() &&
1048           TLI.isTruncateFree(LargestVT, VT))
1049         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
1050       else
1051         Value = getMemsetValue(Val, Ty, MIB);
1052       if (!Value)
1053         return false;
1054     }
1055 
1056     auto *StoreMMO =
1057         MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
1058 
1059     Register Ptr = Dst;
1060     if (DstOff != 0) {
1061       auto Offset =
1062           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
1063       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1064     }
1065 
1066     MIB.buildStore(Value, Ptr, *StoreMMO);
1067     DstOff += Ty.getSizeInBytes();
1068     Size -= TySize;
1069   }
1070 
1071   MI.eraseFromParent();
1072   return true;
1073 }
1074 
1075 
1076 bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
1077                                     Register Src, unsigned KnownLen,
1078                                     unsigned DstAlign, unsigned SrcAlign,
1079                                     bool IsVolatile) {
1080   auto &MF = *MI.getParent()->getParent();
1081   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1082   auto &DL = MF.getDataLayout();
1083   LLVMContext &C = MF.getFunction().getContext();
1084 
1085   assert(KnownLen != 0 && "Have a zero length memcpy length!");
1086 
1087   bool DstAlignCanChange = false;
1088   MachineFrameInfo &MFI = MF.getFrameInfo();
1089   bool OptSize = shouldLowerMemFuncForSize(MF);
1090   unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1091 
1092   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1093   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1094     DstAlignCanChange = true;
1095 
1096   // FIXME: infer better src pointer alignment like SelectionDAG does here.
1097   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
1098   // if the memcpy is in a tail call position.
1099 
1100   unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
1101   std::vector<LLT> MemOps;
1102 
1103   const auto &DstMMO = **MI.memoperands_begin();
1104   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1105   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1106   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1107 
1108   if (!findGISelOptimalMemOpLowering(
1109           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1110           SrcAlign,
1111           /*IsMemset=*/false,
1112           /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1113           /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
1114           SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1115     return false;
1116 
1117   if (DstAlignCanChange) {
1118     // Get an estimate of the type from the LLT.
1119     Type *IRTy = getTypeForLLT(MemOps[0], C);
1120     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1121 
1122     // Don't promote to an alignment that would require dynamic stack
1123     // realignment.
1124     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1125     if (!TRI->needsStackRealignment(MF))
1126       while (NewAlign > Alignment &&
1127              DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1128         NewAlign /= 2;
1129 
1130     if (NewAlign > Alignment) {
1131       Alignment = NewAlign;
1132       unsigned FI = FIDef->getOperand(1).getIndex();
1133       // Give the stack frame object a larger alignment if needed.
1134       if (MFI.getObjectAlignment(FI) < Alignment)
1135         MFI.setObjectAlignment(FI, Alignment);
1136     }
1137   }
1138 
1139   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
1140 
1141   MachineIRBuilder MIB(MI);
1142   // Now we need to emit a pair of load and stores for each of the types we've
1143   // collected. I.e. for each type, generate a load from the source pointer of
1144   // that type width, and then generate a corresponding store to the dest buffer
1145   // of that value loaded. This can result in a sequence of loads and stores
1146   // mixed types, depending on what the target specifies as good types to use.
1147   unsigned CurrOffset = 0;
1148   LLT PtrTy = MRI.getType(Src);
1149   unsigned Size = KnownLen;
1150   for (auto CopyTy : MemOps) {
1151     // Issuing an unaligned load / store pair  that overlaps with the previous
1152     // pair. Adjust the offset accordingly.
1153     if (CopyTy.getSizeInBytes() > Size)
1154       CurrOffset -= CopyTy.getSizeInBytes() - Size;
1155 
1156     // Construct MMOs for the accesses.
1157     auto *LoadMMO =
1158         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1159     auto *StoreMMO =
1160         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1161 
1162     // Create the load.
1163     Register LoadPtr = Src;
1164     Register Offset;
1165     if (CurrOffset != 0) {
1166       Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
1167                    .getReg(0);
1168       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1169     }
1170     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
1171 
1172     // Create the store.
1173     Register StorePtr =
1174         CurrOffset == 0 ? Dst : MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1175     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
1176     CurrOffset += CopyTy.getSizeInBytes();
1177     Size -= CopyTy.getSizeInBytes();
1178   }
1179 
1180   MI.eraseFromParent();
1181   return true;
1182 }
1183 
1184 bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
1185                                     Register Src, unsigned KnownLen,
1186                                     unsigned DstAlign, unsigned SrcAlign,
1187                                     bool IsVolatile) {
1188   auto &MF = *MI.getParent()->getParent();
1189   const auto &TLI = *MF.getSubtarget().getTargetLowering();
1190   auto &DL = MF.getDataLayout();
1191   LLVMContext &C = MF.getFunction().getContext();
1192 
1193   assert(KnownLen != 0 && "Have a zero length memmove length!");
1194 
1195   bool DstAlignCanChange = false;
1196   MachineFrameInfo &MFI = MF.getFrameInfo();
1197   bool OptSize = shouldLowerMemFuncForSize(MF);
1198   unsigned Alignment = MinAlign(DstAlign, SrcAlign);
1199 
1200   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
1201   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
1202     DstAlignCanChange = true;
1203 
1204   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
1205   std::vector<LLT> MemOps;
1206 
1207   const auto &DstMMO = **MI.memoperands_begin();
1208   const auto &SrcMMO = **std::next(MI.memoperands_begin());
1209   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
1210   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
1211 
1212   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
1213   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
1214   // same thing here.
1215   if (!findGISelOptimalMemOpLowering(
1216           MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
1217           SrcAlign,
1218           /*IsMemset=*/false,
1219           /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
1220           /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
1221           SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
1222     return false;
1223 
1224   if (DstAlignCanChange) {
1225     // Get an estimate of the type from the LLT.
1226     Type *IRTy = getTypeForLLT(MemOps[0], C);
1227     unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
1228 
1229     // Don't promote to an alignment that would require dynamic stack
1230     // realignment.
1231     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1232     if (!TRI->needsStackRealignment(MF))
1233       while (NewAlign > Alignment &&
1234              DL.exceedsNaturalStackAlignment(Align(NewAlign)))
1235         NewAlign /= 2;
1236 
1237     if (NewAlign > Alignment) {
1238       Alignment = NewAlign;
1239       unsigned FI = FIDef->getOperand(1).getIndex();
1240       // Give the stack frame object a larger alignment if needed.
1241       if (MFI.getObjectAlignment(FI) < Alignment)
1242         MFI.setObjectAlignment(FI, Alignment);
1243     }
1244   }
1245 
1246   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
1247 
1248   MachineIRBuilder MIB(MI);
1249   // Memmove requires that we perform the loads first before issuing the stores.
1250   // Apart from that, this loop is pretty much doing the same thing as the
1251   // memcpy codegen function.
1252   unsigned CurrOffset = 0;
1253   LLT PtrTy = MRI.getType(Src);
1254   SmallVector<Register, 16> LoadVals;
1255   for (auto CopyTy : MemOps) {
1256     // Construct MMO for the load.
1257     auto *LoadMMO =
1258         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
1259 
1260     // Create the load.
1261     Register LoadPtr = Src;
1262     if (CurrOffset != 0) {
1263       auto Offset =
1264           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1265       LoadPtr = MIB.buildPtrAdd(PtrTy, Src, Offset).getReg(0);
1266     }
1267     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
1268     CurrOffset += CopyTy.getSizeInBytes();
1269   }
1270 
1271   CurrOffset = 0;
1272   for (unsigned I = 0; I < MemOps.size(); ++I) {
1273     LLT CopyTy = MemOps[I];
1274     // Now store the values loaded.
1275     auto *StoreMMO =
1276         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
1277 
1278     Register StorePtr = Dst;
1279     if (CurrOffset != 0) {
1280       auto Offset =
1281           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
1282       StorePtr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
1283     }
1284     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
1285     CurrOffset += CopyTy.getSizeInBytes();
1286   }
1287   MI.eraseFromParent();
1288   return true;
1289 }
1290 
1291 bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
1292   // This combine is fairly complex so it's not written with a separate
1293   // matcher function.
1294   assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
1295   Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
1296   assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
1297           ID == Intrinsic::memset) &&
1298          "Expected a memcpy like intrinsic");
1299 
1300   auto MMOIt = MI.memoperands_begin();
1301   const MachineMemOperand *MemOp = *MMOIt;
1302   bool IsVolatile = MemOp->isVolatile();
1303   // Don't try to optimize volatile.
1304   if (IsVolatile)
1305     return false;
1306 
1307   unsigned DstAlign = MemOp->getBaseAlignment();
1308   unsigned SrcAlign = 0;
1309   Register Dst = MI.getOperand(1).getReg();
1310   Register Src = MI.getOperand(2).getReg();
1311   Register Len = MI.getOperand(3).getReg();
1312 
1313   if (ID != Intrinsic::memset) {
1314     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
1315     MemOp = *(++MMOIt);
1316     SrcAlign = MemOp->getBaseAlignment();
1317   }
1318 
1319   // See if this is a constant length copy
1320   auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
1321   if (!LenVRegAndVal)
1322     return false; // Leave it to the legalizer to lower it to a libcall.
1323   unsigned KnownLen = LenVRegAndVal->Value;
1324 
1325   if (KnownLen == 0) {
1326     MI.eraseFromParent();
1327     return true;
1328   }
1329 
1330   if (MaxLen && KnownLen > MaxLen)
1331     return false;
1332 
1333   if (ID == Intrinsic::memcpy)
1334     return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1335   if (ID == Intrinsic::memmove)
1336     return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
1337   if (ID == Intrinsic::memset)
1338     return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
1339   return false;
1340 }
1341 
1342 bool CombinerHelper::matchPtrAddImmedChain(MachineInstr &MI,
1343                                            PtrAddChain &MatchInfo) {
1344   // We're trying to match the following pattern:
1345   //   %t1 = G_PTR_ADD %base, G_CONSTANT imm1
1346   //   %root = G_PTR_ADD %t1, G_CONSTANT imm2
1347   // -->
1348   //   %root = G_PTR_ADD %base, G_CONSTANT (imm1 + imm2)
1349 
1350   if (MI.getOpcode() != TargetOpcode::G_PTR_ADD)
1351     return false;
1352 
1353   Register Add2 = MI.getOperand(1).getReg();
1354   Register Imm1 = MI.getOperand(2).getReg();
1355   auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
1356   if (!MaybeImmVal)
1357     return false;
1358 
1359   MachineInstr *Add2Def = MRI.getUniqueVRegDef(Add2);
1360   if (!Add2Def || Add2Def->getOpcode() != TargetOpcode::G_PTR_ADD)
1361     return false;
1362 
1363   Register Base = Add2Def->getOperand(1).getReg();
1364   Register Imm2 = Add2Def->getOperand(2).getReg();
1365   auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
1366   if (!MaybeImm2Val)
1367     return false;
1368 
1369   // Pass the combined immediate to the apply function.
1370   MatchInfo.Imm = MaybeImmVal->Value + MaybeImm2Val->Value;
1371   MatchInfo.Base = Base;
1372   return true;
1373 }
1374 
1375 bool CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
1376                                            PtrAddChain &MatchInfo) {
1377   assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
1378   MachineIRBuilder MIB(MI);
1379   LLT OffsetTy = MRI.getType(MI.getOperand(2).getReg());
1380   auto NewOffset = MIB.buildConstant(OffsetTy, MatchInfo.Imm);
1381   Observer.changingInstr(MI);
1382   MI.getOperand(1).setReg(MatchInfo.Base);
1383   MI.getOperand(2).setReg(NewOffset.getReg(0));
1384   Observer.changedInstr(MI);
1385   return true;
1386 }
1387 
1388 bool CombinerHelper::tryCombine(MachineInstr &MI) {
1389   if (tryCombineCopy(MI))
1390     return true;
1391   if (tryCombineExtendingLoads(MI))
1392     return true;
1393   if (tryCombineIndexedLoadStore(MI))
1394     return true;
1395   return false;
1396 }
1397