xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (revision 62ff619dcc3540659a319be71c9a489f1659e14a)
1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/GlobalISel/Utils.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/TargetFrameLowering.h"
24 #include "llvm/CodeGen/TargetInstrInfo.h"
25 #include "llvm/CodeGen/TargetLowering.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 #include "llvm/CodeGen/TargetSubtargetInfo.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31 #include "llvm/Support/raw_ostream.h"
32 #include "llvm/Target/TargetMachine.h"
33 
34 #define DEBUG_TYPE "legalizer"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace MIPatternMatch;
39 
40 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
41 ///
42 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
43 /// with any leftover piece as type \p LeftoverTy
44 ///
45 /// Returns -1 in the first element of the pair if the breakdown is not
46 /// satisfiable.
47 static std::pair<int, int>
48 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
49   assert(!LeftoverTy.isValid() && "this is an out argument");
50 
51   unsigned Size = OrigTy.getSizeInBits();
52   unsigned NarrowSize = NarrowTy.getSizeInBits();
53   unsigned NumParts = Size / NarrowSize;
54   unsigned LeftoverSize = Size - NumParts * NarrowSize;
55   assert(Size > NarrowSize);
56 
57   if (LeftoverSize == 0)
58     return {NumParts, 0};
59 
60   if (NarrowTy.isVector()) {
61     unsigned EltSize = OrigTy.getScalarSizeInBits();
62     if (LeftoverSize % EltSize != 0)
63       return {-1, -1};
64     LeftoverTy = LLT::scalarOrVector(
65         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
66   } else {
67     LeftoverTy = LLT::scalar(LeftoverSize);
68   }
69 
70   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
71   return std::make_pair(NumParts, NumLeftover);
72 }
73 
74 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
75 
76   if (!Ty.isScalar())
77     return nullptr;
78 
79   switch (Ty.getSizeInBits()) {
80   case 16:
81     return Type::getHalfTy(Ctx);
82   case 32:
83     return Type::getFloatTy(Ctx);
84   case 64:
85     return Type::getDoubleTy(Ctx);
86   case 80:
87     return Type::getX86_FP80Ty(Ctx);
88   case 128:
89     return Type::getFP128Ty(Ctx);
90   default:
91     return nullptr;
92   }
93 }
94 
95 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
96                                  GISelChangeObserver &Observer,
97                                  MachineIRBuilder &Builder)
98     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
99       LI(*MF.getSubtarget().getLegalizerInfo()),
100       TLI(*MF.getSubtarget().getTargetLowering()) { }
101 
102 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
103                                  GISelChangeObserver &Observer,
104                                  MachineIRBuilder &B)
105   : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
106     TLI(*MF.getSubtarget().getTargetLowering()) { }
107 
108 LegalizerHelper::LegalizeResult
109 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
110                                    LostDebugLocObserver &LocObserver) {
111   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
112 
113   MIRBuilder.setInstrAndDebugLoc(MI);
114 
115   if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
116       MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
117     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
118   auto Step = LI.getAction(MI, MRI);
119   switch (Step.Action) {
120   case Legal:
121     LLVM_DEBUG(dbgs() << ".. Already legal\n");
122     return AlreadyLegal;
123   case Libcall:
124     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
125     return libcall(MI, LocObserver);
126   case NarrowScalar:
127     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
128     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
129   case WidenScalar:
130     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
131     return widenScalar(MI, Step.TypeIdx, Step.NewType);
132   case Bitcast:
133     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
134     return bitcast(MI, Step.TypeIdx, Step.NewType);
135   case Lower:
136     LLVM_DEBUG(dbgs() << ".. Lower\n");
137     return lower(MI, Step.TypeIdx, Step.NewType);
138   case FewerElements:
139     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
140     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
141   case MoreElements:
142     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
143     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
144   case Custom:
145     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
146     return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
147   default:
148     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
149     return UnableToLegalize;
150   }
151 }
152 
153 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
154                                    SmallVectorImpl<Register> &VRegs) {
155   for (int i = 0; i < NumParts; ++i)
156     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
157   MIRBuilder.buildUnmerge(VRegs, Reg);
158 }
159 
160 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
161                                    LLT MainTy, LLT &LeftoverTy,
162                                    SmallVectorImpl<Register> &VRegs,
163                                    SmallVectorImpl<Register> &LeftoverRegs) {
164   assert(!LeftoverTy.isValid() && "this is an out argument");
165 
166   unsigned RegSize = RegTy.getSizeInBits();
167   unsigned MainSize = MainTy.getSizeInBits();
168   unsigned NumParts = RegSize / MainSize;
169   unsigned LeftoverSize = RegSize - NumParts * MainSize;
170 
171   // Use an unmerge when possible.
172   if (LeftoverSize == 0) {
173     for (unsigned I = 0; I < NumParts; ++I)
174       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
175     MIRBuilder.buildUnmerge(VRegs, Reg);
176     return true;
177   }
178 
179   // Perform irregular split. Leftover is last element of RegPieces.
180   if (MainTy.isVector()) {
181     SmallVector<Register, 8> RegPieces;
182     extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
183     for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
184       VRegs.push_back(RegPieces[i]);
185     LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
186     LeftoverTy = MRI.getType(LeftoverRegs[0]);
187     return true;
188   }
189 
190   LeftoverTy = LLT::scalar(LeftoverSize);
191   // For irregular sizes, extract the individual parts.
192   for (unsigned I = 0; I != NumParts; ++I) {
193     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
194     VRegs.push_back(NewReg);
195     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
196   }
197 
198   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
199        Offset += LeftoverSize) {
200     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
201     LeftoverRegs.push_back(NewReg);
202     MIRBuilder.buildExtract(NewReg, Reg, Offset);
203   }
204 
205   return true;
206 }
207 
208 void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
209                                          SmallVectorImpl<Register> &VRegs) {
210   LLT RegTy = MRI.getType(Reg);
211   assert(RegTy.isVector() && "Expected a vector type");
212 
213   LLT EltTy = RegTy.getElementType();
214   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
215   unsigned RegNumElts = RegTy.getNumElements();
216   unsigned LeftoverNumElts = RegNumElts % NumElts;
217   unsigned NumNarrowTyPieces = RegNumElts / NumElts;
218 
219   // Perfect split without leftover
220   if (LeftoverNumElts == 0)
221     return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);
222 
223   // Irregular split. Provide direct access to all elements for artifact
224   // combiner using unmerge to elements. Then build vectors with NumElts
225   // elements. Remaining element(s) will be (used to build vector) Leftover.
226   SmallVector<Register, 8> Elts;
227   extractParts(Reg, EltTy, RegNumElts, Elts);
228 
229   unsigned Offset = 0;
230   // Requested sub-vectors of NarrowTy.
231   for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
232     ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
233     VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
234   }
235 
236   // Leftover element(s).
237   if (LeftoverNumElts == 1) {
238     VRegs.push_back(Elts[Offset]);
239   } else {
240     LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
241     ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
242     VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0));
243   }
244 }
245 
246 void LegalizerHelper::insertParts(Register DstReg,
247                                   LLT ResultTy, LLT PartTy,
248                                   ArrayRef<Register> PartRegs,
249                                   LLT LeftoverTy,
250                                   ArrayRef<Register> LeftoverRegs) {
251   if (!LeftoverTy.isValid()) {
252     assert(LeftoverRegs.empty());
253 
254     if (!ResultTy.isVector()) {
255       MIRBuilder.buildMerge(DstReg, PartRegs);
256       return;
257     }
258 
259     if (PartTy.isVector())
260       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
261     else
262       MIRBuilder.buildBuildVector(DstReg, PartRegs);
263     return;
264   }
265 
266   // Merge sub-vectors with different number of elements and insert into DstReg.
267   if (ResultTy.isVector()) {
268     assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
269     SmallVector<Register, 8> AllRegs;
270     for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
271       AllRegs.push_back(Reg);
272     return mergeMixedSubvectors(DstReg, AllRegs);
273   }
274 
275   SmallVector<Register> GCDRegs;
276   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
277   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
278     extractGCDType(GCDRegs, GCDTy, PartReg);
279   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
280   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
281 }
282 
283 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
284                                        Register Reg) {
285   LLT Ty = MRI.getType(Reg);
286   SmallVector<Register, 8> RegElts;
287   extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
288   Elts.append(RegElts);
289 }
290 
291 /// Merge \p PartRegs with different types into \p DstReg.
292 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
293                                            ArrayRef<Register> PartRegs) {
294   SmallVector<Register, 8> AllElts;
295   for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
296     appendVectorElts(AllElts, PartRegs[i]);
297 
298   Register Leftover = PartRegs[PartRegs.size() - 1];
299   if (MRI.getType(Leftover).isScalar())
300     AllElts.push_back(Leftover);
301   else
302     appendVectorElts(AllElts, Leftover);
303 
304   MIRBuilder.buildMerge(DstReg, AllElts);
305 }
306 
307 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
308 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
309                               const MachineInstr &MI) {
310   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
311 
312   const int StartIdx = Regs.size();
313   const int NumResults = MI.getNumOperands() - 1;
314   Regs.resize(Regs.size() + NumResults);
315   for (int I = 0; I != NumResults; ++I)
316     Regs[StartIdx + I] = MI.getOperand(I).getReg();
317 }
318 
319 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
320                                      LLT GCDTy, Register SrcReg) {
321   LLT SrcTy = MRI.getType(SrcReg);
322   if (SrcTy == GCDTy) {
323     // If the source already evenly divides the result type, we don't need to do
324     // anything.
325     Parts.push_back(SrcReg);
326   } else {
327     // Need to split into common type sized pieces.
328     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
329     getUnmergeResults(Parts, *Unmerge);
330   }
331 }
332 
333 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
334                                     LLT NarrowTy, Register SrcReg) {
335   LLT SrcTy = MRI.getType(SrcReg);
336   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
337   extractGCDType(Parts, GCDTy, SrcReg);
338   return GCDTy;
339 }
340 
341 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
342                                          SmallVectorImpl<Register> &VRegs,
343                                          unsigned PadStrategy) {
344   LLT LCMTy = getLCMType(DstTy, NarrowTy);
345 
346   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
347   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
348   int NumOrigSrc = VRegs.size();
349 
350   Register PadReg;
351 
352   // Get a value we can use to pad the source value if the sources won't evenly
353   // cover the result type.
354   if (NumOrigSrc < NumParts * NumSubParts) {
355     if (PadStrategy == TargetOpcode::G_ZEXT)
356       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
357     else if (PadStrategy == TargetOpcode::G_ANYEXT)
358       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
359     else {
360       assert(PadStrategy == TargetOpcode::G_SEXT);
361 
362       // Shift the sign bit of the low register through the high register.
363       auto ShiftAmt =
364         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
365       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
366     }
367   }
368 
369   // Registers for the final merge to be produced.
370   SmallVector<Register, 4> Remerge(NumParts);
371 
372   // Registers needed for intermediate merges, which will be merged into a
373   // source for Remerge.
374   SmallVector<Register, 4> SubMerge(NumSubParts);
375 
376   // Once we've fully read off the end of the original source bits, we can reuse
377   // the same high bits for remaining padding elements.
378   Register AllPadReg;
379 
380   // Build merges to the LCM type to cover the original result type.
381   for (int I = 0; I != NumParts; ++I) {
382     bool AllMergePartsArePadding = true;
383 
384     // Build the requested merges to the requested type.
385     for (int J = 0; J != NumSubParts; ++J) {
386       int Idx = I * NumSubParts + J;
387       if (Idx >= NumOrigSrc) {
388         SubMerge[J] = PadReg;
389         continue;
390       }
391 
392       SubMerge[J] = VRegs[Idx];
393 
394       // There are meaningful bits here we can't reuse later.
395       AllMergePartsArePadding = false;
396     }
397 
398     // If we've filled up a complete piece with padding bits, we can directly
399     // emit the natural sized constant if applicable, rather than a merge of
400     // smaller constants.
401     if (AllMergePartsArePadding && !AllPadReg) {
402       if (PadStrategy == TargetOpcode::G_ANYEXT)
403         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
404       else if (PadStrategy == TargetOpcode::G_ZEXT)
405         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
406 
407       // If this is a sign extension, we can't materialize a trivial constant
408       // with the right type and have to produce a merge.
409     }
410 
411     if (AllPadReg) {
412       // Avoid creating additional instructions if we're just adding additional
413       // copies of padding bits.
414       Remerge[I] = AllPadReg;
415       continue;
416     }
417 
418     if (NumSubParts == 1)
419       Remerge[I] = SubMerge[0];
420     else
421       Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);
422 
423     // In the sign extend padding case, re-use the first all-signbit merge.
424     if (AllMergePartsArePadding && !AllPadReg)
425       AllPadReg = Remerge[I];
426   }
427 
428   VRegs = std::move(Remerge);
429   return LCMTy;
430 }
431 
432 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
433                                                ArrayRef<Register> RemergeRegs) {
434   LLT DstTy = MRI.getType(DstReg);
435 
436   // Create the merge to the widened source, and extract the relevant bits into
437   // the result.
438 
439   if (DstTy == LCMTy) {
440     MIRBuilder.buildMerge(DstReg, RemergeRegs);
441     return;
442   }
443 
444   auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
445   if (DstTy.isScalar() && LCMTy.isScalar()) {
446     MIRBuilder.buildTrunc(DstReg, Remerge);
447     return;
448   }
449 
450   if (LCMTy.isVector()) {
451     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
452     SmallVector<Register, 8> UnmergeDefs(NumDefs);
453     UnmergeDefs[0] = DstReg;
454     for (unsigned I = 1; I != NumDefs; ++I)
455       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
456 
457     MIRBuilder.buildUnmerge(UnmergeDefs,
458                             MIRBuilder.buildMerge(LCMTy, RemergeRegs));
459     return;
460   }
461 
462   llvm_unreachable("unhandled case");
463 }
464 
465 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
466 #define RTLIBCASE_INT(LibcallPrefix)                                           \
467   do {                                                                         \
468     switch (Size) {                                                            \
469     case 32:                                                                   \
470       return RTLIB::LibcallPrefix##32;                                         \
471     case 64:                                                                   \
472       return RTLIB::LibcallPrefix##64;                                         \
473     case 128:                                                                  \
474       return RTLIB::LibcallPrefix##128;                                        \
475     default:                                                                   \
476       llvm_unreachable("unexpected size");                                     \
477     }                                                                          \
478   } while (0)
479 
480 #define RTLIBCASE(LibcallPrefix)                                               \
481   do {                                                                         \
482     switch (Size) {                                                            \
483     case 32:                                                                   \
484       return RTLIB::LibcallPrefix##32;                                         \
485     case 64:                                                                   \
486       return RTLIB::LibcallPrefix##64;                                         \
487     case 80:                                                                   \
488       return RTLIB::LibcallPrefix##80;                                         \
489     case 128:                                                                  \
490       return RTLIB::LibcallPrefix##128;                                        \
491     default:                                                                   \
492       llvm_unreachable("unexpected size");                                     \
493     }                                                                          \
494   } while (0)
495 
496   switch (Opcode) {
497   case TargetOpcode::G_SDIV:
498     RTLIBCASE_INT(SDIV_I);
499   case TargetOpcode::G_UDIV:
500     RTLIBCASE_INT(UDIV_I);
501   case TargetOpcode::G_SREM:
502     RTLIBCASE_INT(SREM_I);
503   case TargetOpcode::G_UREM:
504     RTLIBCASE_INT(UREM_I);
505   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
506     RTLIBCASE_INT(CTLZ_I);
507   case TargetOpcode::G_FADD:
508     RTLIBCASE(ADD_F);
509   case TargetOpcode::G_FSUB:
510     RTLIBCASE(SUB_F);
511   case TargetOpcode::G_FMUL:
512     RTLIBCASE(MUL_F);
513   case TargetOpcode::G_FDIV:
514     RTLIBCASE(DIV_F);
515   case TargetOpcode::G_FEXP:
516     RTLIBCASE(EXP_F);
517   case TargetOpcode::G_FEXP2:
518     RTLIBCASE(EXP2_F);
519   case TargetOpcode::G_FREM:
520     RTLIBCASE(REM_F);
521   case TargetOpcode::G_FPOW:
522     RTLIBCASE(POW_F);
523   case TargetOpcode::G_FMA:
524     RTLIBCASE(FMA_F);
525   case TargetOpcode::G_FSIN:
526     RTLIBCASE(SIN_F);
527   case TargetOpcode::G_FCOS:
528     RTLIBCASE(COS_F);
529   case TargetOpcode::G_FLOG10:
530     RTLIBCASE(LOG10_F);
531   case TargetOpcode::G_FLOG:
532     RTLIBCASE(LOG_F);
533   case TargetOpcode::G_FLOG2:
534     RTLIBCASE(LOG2_F);
535   case TargetOpcode::G_FCEIL:
536     RTLIBCASE(CEIL_F);
537   case TargetOpcode::G_FFLOOR:
538     RTLIBCASE(FLOOR_F);
539   case TargetOpcode::G_FMINNUM:
540     RTLIBCASE(FMIN_F);
541   case TargetOpcode::G_FMAXNUM:
542     RTLIBCASE(FMAX_F);
543   case TargetOpcode::G_FSQRT:
544     RTLIBCASE(SQRT_F);
545   case TargetOpcode::G_FRINT:
546     RTLIBCASE(RINT_F);
547   case TargetOpcode::G_FNEARBYINT:
548     RTLIBCASE(NEARBYINT_F);
549   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
550     RTLIBCASE(ROUNDEVEN_F);
551   }
552   llvm_unreachable("Unknown libcall function");
553 }
554 
555 /// True if an instruction is in tail position in its caller. Intended for
556 /// legalizing libcalls as tail calls when possible.
557 static bool isLibCallInTailPosition(MachineInstr &MI,
558                                     const TargetInstrInfo &TII,
559                                     MachineRegisterInfo &MRI) {
560   MachineBasicBlock &MBB = *MI.getParent();
561   const Function &F = MBB.getParent()->getFunction();
562 
563   // Conservatively require the attributes of the call to match those of
564   // the return. Ignore NoAlias and NonNull because they don't affect the
565   // call sequence.
566   AttributeList CallerAttrs = F.getAttributes();
567   if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
568           .removeAttribute(Attribute::NoAlias)
569           .removeAttribute(Attribute::NonNull)
570           .hasAttributes())
571     return false;
572 
573   // It's not safe to eliminate the sign / zero extension of the return value.
574   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
575       CallerAttrs.hasRetAttr(Attribute::SExt))
576     return false;
577 
578   // Only tail call if the following instruction is a standard return or if we
579   // have a `thisreturn` callee, and a sequence like:
580   //
581   //   G_MEMCPY %0, %1, %2
582   //   $x0 = COPY %0
583   //   RET_ReallyLR implicit $x0
584   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
585   if (Next != MBB.instr_end() && Next->isCopy()) {
586     switch (MI.getOpcode()) {
587     default:
588       llvm_unreachable("unsupported opcode");
589     case TargetOpcode::G_BZERO:
590       return false;
591     case TargetOpcode::G_MEMCPY:
592     case TargetOpcode::G_MEMMOVE:
593     case TargetOpcode::G_MEMSET:
594       break;
595     }
596 
597     Register VReg = MI.getOperand(0).getReg();
598     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
599       return false;
600 
601     Register PReg = Next->getOperand(0).getReg();
602     if (!PReg.isPhysical())
603       return false;
604 
605     auto Ret = next_nodbg(Next, MBB.instr_end());
606     if (Ret == MBB.instr_end() || !Ret->isReturn())
607       return false;
608 
609     if (Ret->getNumImplicitOperands() != 1)
610       return false;
611 
612     if (PReg != Ret->getOperand(0).getReg())
613       return false;
614 
615     // Skip over the COPY that we just validated.
616     Next = Ret;
617   }
618 
619   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
620     return false;
621 
622   return true;
623 }
624 
625 LegalizerHelper::LegalizeResult
626 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
627                     const CallLowering::ArgInfo &Result,
628                     ArrayRef<CallLowering::ArgInfo> Args,
629                     const CallingConv::ID CC) {
630   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
631 
632   CallLowering::CallLoweringInfo Info;
633   Info.CallConv = CC;
634   Info.Callee = MachineOperand::CreateES(Name);
635   Info.OrigRet = Result;
636   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
637   if (!CLI.lowerCall(MIRBuilder, Info))
638     return LegalizerHelper::UnableToLegalize;
639 
640   return LegalizerHelper::Legalized;
641 }
642 
643 LegalizerHelper::LegalizeResult
644 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
645                     const CallLowering::ArgInfo &Result,
646                     ArrayRef<CallLowering::ArgInfo> Args) {
647   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
648   const char *Name = TLI.getLibcallName(Libcall);
649   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
650   return createLibcall(MIRBuilder, Name, Result, Args, CC);
651 }
652 
653 // Useful for libcalls where all operands have the same type.
654 static LegalizerHelper::LegalizeResult
655 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
656               Type *OpType) {
657   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
658 
659   // FIXME: What does the original arg index mean here?
660   SmallVector<CallLowering::ArgInfo, 3> Args;
661   for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
662     Args.push_back({MO.getReg(), OpType, 0});
663   return createLibcall(MIRBuilder, Libcall,
664                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
665 }
666 
667 LegalizerHelper::LegalizeResult
668 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
669                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
670   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
671 
672   SmallVector<CallLowering::ArgInfo, 3> Args;
673   // Add all the args, except for the last which is an imm denoting 'tail'.
674   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
675     Register Reg = MI.getOperand(i).getReg();
676 
677     // Need derive an IR type for call lowering.
678     LLT OpLLT = MRI.getType(Reg);
679     Type *OpTy = nullptr;
680     if (OpLLT.isPointer())
681       OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
682     else
683       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
684     Args.push_back({Reg, OpTy, 0});
685   }
686 
687   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
688   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
689   RTLIB::Libcall RTLibcall;
690   unsigned Opc = MI.getOpcode();
691   switch (Opc) {
692   case TargetOpcode::G_BZERO:
693     RTLibcall = RTLIB::BZERO;
694     break;
695   case TargetOpcode::G_MEMCPY:
696     RTLibcall = RTLIB::MEMCPY;
697     Args[0].Flags[0].setReturned();
698     break;
699   case TargetOpcode::G_MEMMOVE:
700     RTLibcall = RTLIB::MEMMOVE;
701     Args[0].Flags[0].setReturned();
702     break;
703   case TargetOpcode::G_MEMSET:
704     RTLibcall = RTLIB::MEMSET;
705     Args[0].Flags[0].setReturned();
706     break;
707   default:
708     llvm_unreachable("unsupported opcode");
709   }
710   const char *Name = TLI.getLibcallName(RTLibcall);
711 
712   // Unsupported libcall on the target.
713   if (!Name) {
714     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
715                       << MIRBuilder.getTII().getName(Opc) << "\n");
716     return LegalizerHelper::UnableToLegalize;
717   }
718 
719   CallLowering::CallLoweringInfo Info;
720   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
721   Info.Callee = MachineOperand::CreateES(Name);
722   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
723   Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
724                     isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
725 
726   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
727   if (!CLI.lowerCall(MIRBuilder, Info))
728     return LegalizerHelper::UnableToLegalize;
729 
730   if (Info.LoweredTailCall) {
731     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
732 
733     // Check debug locations before removing the return.
734     LocObserver.checkpoint(true);
735 
736     // We must have a return following the call (or debug insts) to get past
737     // isLibCallInTailPosition.
738     do {
739       MachineInstr *Next = MI.getNextNode();
740       assert(Next &&
741              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
742              "Expected instr following MI to be return or debug inst?");
743       // We lowered a tail call, so the call is now the return from the block.
744       // Delete the old return.
745       Next->eraseFromParent();
746     } while (MI.getNextNode());
747 
748     // We expect to lose the debug location from the return.
749     LocObserver.checkpoint(false);
750   }
751 
752   return LegalizerHelper::Legalized;
753 }
754 
755 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
756                                        Type *FromType) {
757   auto ToMVT = MVT::getVT(ToType);
758   auto FromMVT = MVT::getVT(FromType);
759 
760   switch (Opcode) {
761   case TargetOpcode::G_FPEXT:
762     return RTLIB::getFPEXT(FromMVT, ToMVT);
763   case TargetOpcode::G_FPTRUNC:
764     return RTLIB::getFPROUND(FromMVT, ToMVT);
765   case TargetOpcode::G_FPTOSI:
766     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
767   case TargetOpcode::G_FPTOUI:
768     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
769   case TargetOpcode::G_SITOFP:
770     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
771   case TargetOpcode::G_UITOFP:
772     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
773   }
774   llvm_unreachable("Unsupported libcall function");
775 }
776 
777 static LegalizerHelper::LegalizeResult
778 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
779                   Type *FromType) {
780   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
781   return createLibcall(MIRBuilder, Libcall,
782                        {MI.getOperand(0).getReg(), ToType, 0},
783                        {{MI.getOperand(1).getReg(), FromType, 0}});
784 }
785 
786 LegalizerHelper::LegalizeResult
787 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
788   LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
789   unsigned Size = LLTy.getSizeInBits();
790   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
791 
792   switch (MI.getOpcode()) {
793   default:
794     return UnableToLegalize;
795   case TargetOpcode::G_SDIV:
796   case TargetOpcode::G_UDIV:
797   case TargetOpcode::G_SREM:
798   case TargetOpcode::G_UREM:
799   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
800     Type *HLTy = IntegerType::get(Ctx, Size);
801     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
802     if (Status != Legalized)
803       return Status;
804     break;
805   }
806   case TargetOpcode::G_FADD:
807   case TargetOpcode::G_FSUB:
808   case TargetOpcode::G_FMUL:
809   case TargetOpcode::G_FDIV:
810   case TargetOpcode::G_FMA:
811   case TargetOpcode::G_FPOW:
812   case TargetOpcode::G_FREM:
813   case TargetOpcode::G_FCOS:
814   case TargetOpcode::G_FSIN:
815   case TargetOpcode::G_FLOG10:
816   case TargetOpcode::G_FLOG:
817   case TargetOpcode::G_FLOG2:
818   case TargetOpcode::G_FEXP:
819   case TargetOpcode::G_FEXP2:
820   case TargetOpcode::G_FCEIL:
821   case TargetOpcode::G_FFLOOR:
822   case TargetOpcode::G_FMINNUM:
823   case TargetOpcode::G_FMAXNUM:
824   case TargetOpcode::G_FSQRT:
825   case TargetOpcode::G_FRINT:
826   case TargetOpcode::G_FNEARBYINT:
827   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
828     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
829     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
830       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
831       return UnableToLegalize;
832     }
833     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
834     if (Status != Legalized)
835       return Status;
836     break;
837   }
838   case TargetOpcode::G_FPEXT:
839   case TargetOpcode::G_FPTRUNC: {
840     Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
841     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
842     if (!FromTy || !ToTy)
843       return UnableToLegalize;
844     LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
845     if (Status != Legalized)
846       return Status;
847     break;
848   }
849   case TargetOpcode::G_FPTOSI:
850   case TargetOpcode::G_FPTOUI: {
851     // FIXME: Support other types
852     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
853     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
854     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
855       return UnableToLegalize;
856     LegalizeResult Status = conversionLibcall(
857         MI, MIRBuilder,
858         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
859         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
860     if (Status != Legalized)
861       return Status;
862     break;
863   }
864   case TargetOpcode::G_SITOFP:
865   case TargetOpcode::G_UITOFP: {
866     // FIXME: Support other types
867     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
868     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
869     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
870       return UnableToLegalize;
871     LegalizeResult Status = conversionLibcall(
872         MI, MIRBuilder,
873         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
874         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
875     if (Status != Legalized)
876       return Status;
877     break;
878   }
879   case TargetOpcode::G_BZERO:
880   case TargetOpcode::G_MEMCPY:
881   case TargetOpcode::G_MEMMOVE:
882   case TargetOpcode::G_MEMSET: {
883     LegalizeResult Result =
884         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
885     if (Result != Legalized)
886       return Result;
887     MI.eraseFromParent();
888     return Result;
889   }
890   }
891 
892   MI.eraseFromParent();
893   return Legalized;
894 }
895 
896 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
897                                                               unsigned TypeIdx,
898                                                               LLT NarrowTy) {
899   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
900   uint64_t NarrowSize = NarrowTy.getSizeInBits();
901 
902   switch (MI.getOpcode()) {
903   default:
904     return UnableToLegalize;
905   case TargetOpcode::G_IMPLICIT_DEF: {
906     Register DstReg = MI.getOperand(0).getReg();
907     LLT DstTy = MRI.getType(DstReg);
908 
909     // If SizeOp0 is not an exact multiple of NarrowSize, emit
910     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
911     // FIXME: Although this would also be legal for the general case, it causes
912     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
913     //  combines not being hit). This seems to be a problem related to the
914     //  artifact combiner.
915     if (SizeOp0 % NarrowSize != 0) {
916       LLT ImplicitTy = NarrowTy;
917       if (DstTy.isVector())
918         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
919 
920       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
921       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
922 
923       MI.eraseFromParent();
924       return Legalized;
925     }
926 
927     int NumParts = SizeOp0 / NarrowSize;
928 
929     SmallVector<Register, 2> DstRegs;
930     for (int i = 0; i < NumParts; ++i)
931       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
932 
933     if (DstTy.isVector())
934       MIRBuilder.buildBuildVector(DstReg, DstRegs);
935     else
936       MIRBuilder.buildMerge(DstReg, DstRegs);
937     MI.eraseFromParent();
938     return Legalized;
939   }
940   case TargetOpcode::G_CONSTANT: {
941     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
942     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
943     unsigned TotalSize = Ty.getSizeInBits();
944     unsigned NarrowSize = NarrowTy.getSizeInBits();
945     int NumParts = TotalSize / NarrowSize;
946 
947     SmallVector<Register, 4> PartRegs;
948     for (int I = 0; I != NumParts; ++I) {
949       unsigned Offset = I * NarrowSize;
950       auto K = MIRBuilder.buildConstant(NarrowTy,
951                                         Val.lshr(Offset).trunc(NarrowSize));
952       PartRegs.push_back(K.getReg(0));
953     }
954 
955     LLT LeftoverTy;
956     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
957     SmallVector<Register, 1> LeftoverRegs;
958     if (LeftoverBits != 0) {
959       LeftoverTy = LLT::scalar(LeftoverBits);
960       auto K = MIRBuilder.buildConstant(
961         LeftoverTy,
962         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
963       LeftoverRegs.push_back(K.getReg(0));
964     }
965 
966     insertParts(MI.getOperand(0).getReg(),
967                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
968 
969     MI.eraseFromParent();
970     return Legalized;
971   }
972   case TargetOpcode::G_SEXT:
973   case TargetOpcode::G_ZEXT:
974   case TargetOpcode::G_ANYEXT:
975     return narrowScalarExt(MI, TypeIdx, NarrowTy);
976   case TargetOpcode::G_TRUNC: {
977     if (TypeIdx != 1)
978       return UnableToLegalize;
979 
980     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
981     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
982       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
983       return UnableToLegalize;
984     }
985 
986     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
987     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
988     MI.eraseFromParent();
989     return Legalized;
990   }
991 
992   case TargetOpcode::G_FREEZE: {
993     if (TypeIdx != 0)
994       return UnableToLegalize;
995 
996     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
997     // Should widen scalar first
998     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
999       return UnableToLegalize;
1000 
1001     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1002     SmallVector<Register, 8> Parts;
1003     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1004       Parts.push_back(
1005           MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
1006     }
1007 
1008     MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts);
1009     MI.eraseFromParent();
1010     return Legalized;
1011   }
1012   case TargetOpcode::G_ADD:
1013   case TargetOpcode::G_SUB:
1014   case TargetOpcode::G_SADDO:
1015   case TargetOpcode::G_SSUBO:
1016   case TargetOpcode::G_SADDE:
1017   case TargetOpcode::G_SSUBE:
1018   case TargetOpcode::G_UADDO:
1019   case TargetOpcode::G_USUBO:
1020   case TargetOpcode::G_UADDE:
1021   case TargetOpcode::G_USUBE:
1022     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1023   case TargetOpcode::G_MUL:
1024   case TargetOpcode::G_UMULH:
1025     return narrowScalarMul(MI, NarrowTy);
1026   case TargetOpcode::G_EXTRACT:
1027     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1028   case TargetOpcode::G_INSERT:
1029     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1030   case TargetOpcode::G_LOAD: {
1031     auto &LoadMI = cast<GLoad>(MI);
1032     Register DstReg = LoadMI.getDstReg();
1033     LLT DstTy = MRI.getType(DstReg);
1034     if (DstTy.isVector())
1035       return UnableToLegalize;
1036 
1037     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1038       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1039       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1040       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1041       LoadMI.eraseFromParent();
1042       return Legalized;
1043     }
1044 
1045     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1046   }
1047   case TargetOpcode::G_ZEXTLOAD:
1048   case TargetOpcode::G_SEXTLOAD: {
1049     auto &LoadMI = cast<GExtLoad>(MI);
1050     Register DstReg = LoadMI.getDstReg();
1051     Register PtrReg = LoadMI.getPointerReg();
1052 
1053     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1054     auto &MMO = LoadMI.getMMO();
1055     unsigned MemSize = MMO.getSizeInBits();
1056 
1057     if (MemSize == NarrowSize) {
1058       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1059     } else if (MemSize < NarrowSize) {
1060       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1061     } else if (MemSize > NarrowSize) {
1062       // FIXME: Need to split the load.
1063       return UnableToLegalize;
1064     }
1065 
1066     if (isa<GZExtLoad>(LoadMI))
1067       MIRBuilder.buildZExt(DstReg, TmpReg);
1068     else
1069       MIRBuilder.buildSExt(DstReg, TmpReg);
1070 
1071     LoadMI.eraseFromParent();
1072     return Legalized;
1073   }
1074   case TargetOpcode::G_STORE: {
1075     auto &StoreMI = cast<GStore>(MI);
1076 
1077     Register SrcReg = StoreMI.getValueReg();
1078     LLT SrcTy = MRI.getType(SrcReg);
1079     if (SrcTy.isVector())
1080       return UnableToLegalize;
1081 
1082     int NumParts = SizeOp0 / NarrowSize;
1083     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1084     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1085     if (SrcTy.isVector() && LeftoverBits != 0)
1086       return UnableToLegalize;
1087 
1088     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1089       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1090       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1091       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1092       StoreMI.eraseFromParent();
1093       return Legalized;
1094     }
1095 
1096     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1097   }
1098   case TargetOpcode::G_SELECT:
1099     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1100   case TargetOpcode::G_AND:
1101   case TargetOpcode::G_OR:
1102   case TargetOpcode::G_XOR: {
1103     // Legalize bitwise operation:
1104     // A = BinOp<Ty> B, C
1105     // into:
1106     // B1, ..., BN = G_UNMERGE_VALUES B
1107     // C1, ..., CN = G_UNMERGE_VALUES C
1108     // A1 = BinOp<Ty/N> B1, C2
1109     // ...
1110     // AN = BinOp<Ty/N> BN, CN
1111     // A = G_MERGE_VALUES A1, ..., AN
1112     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1113   }
1114   case TargetOpcode::G_SHL:
1115   case TargetOpcode::G_LSHR:
1116   case TargetOpcode::G_ASHR:
1117     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1118   case TargetOpcode::G_CTLZ:
1119   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1120   case TargetOpcode::G_CTTZ:
1121   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1122   case TargetOpcode::G_CTPOP:
1123     if (TypeIdx == 1)
1124       switch (MI.getOpcode()) {
1125       case TargetOpcode::G_CTLZ:
1126       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1127         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1128       case TargetOpcode::G_CTTZ:
1129       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1130         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1131       case TargetOpcode::G_CTPOP:
1132         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1133       default:
1134         return UnableToLegalize;
1135       }
1136 
1137     Observer.changingInstr(MI);
1138     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1139     Observer.changedInstr(MI);
1140     return Legalized;
1141   case TargetOpcode::G_INTTOPTR:
1142     if (TypeIdx != 1)
1143       return UnableToLegalize;
1144 
1145     Observer.changingInstr(MI);
1146     narrowScalarSrc(MI, NarrowTy, 1);
1147     Observer.changedInstr(MI);
1148     return Legalized;
1149   case TargetOpcode::G_PTRTOINT:
1150     if (TypeIdx != 0)
1151       return UnableToLegalize;
1152 
1153     Observer.changingInstr(MI);
1154     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1155     Observer.changedInstr(MI);
1156     return Legalized;
1157   case TargetOpcode::G_PHI: {
1158     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1159     // NarrowSize.
1160     if (SizeOp0 % NarrowSize != 0)
1161       return UnableToLegalize;
1162 
1163     unsigned NumParts = SizeOp0 / NarrowSize;
1164     SmallVector<Register, 2> DstRegs(NumParts);
1165     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1166     Observer.changingInstr(MI);
1167     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1168       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1169       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
1170       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1171                    SrcRegs[i / 2]);
1172     }
1173     MachineBasicBlock &MBB = *MI.getParent();
1174     MIRBuilder.setInsertPt(MBB, MI);
1175     for (unsigned i = 0; i < NumParts; ++i) {
1176       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1177       MachineInstrBuilder MIB =
1178           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1179       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1180         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1181     }
1182     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1183     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1184     Observer.changedInstr(MI);
1185     MI.eraseFromParent();
1186     return Legalized;
1187   }
1188   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1189   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1190     if (TypeIdx != 2)
1191       return UnableToLegalize;
1192 
1193     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1194     Observer.changingInstr(MI);
1195     narrowScalarSrc(MI, NarrowTy, OpIdx);
1196     Observer.changedInstr(MI);
1197     return Legalized;
1198   }
1199   case TargetOpcode::G_ICMP: {
1200     Register LHS = MI.getOperand(2).getReg();
1201     LLT SrcTy = MRI.getType(LHS);
1202     uint64_t SrcSize = SrcTy.getSizeInBits();
1203     CmpInst::Predicate Pred =
1204         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1205 
1206     // TODO: Handle the non-equality case for weird sizes.
1207     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1208       return UnableToLegalize;
1209 
1210     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1211     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1212     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1213                       LHSLeftoverRegs))
1214       return UnableToLegalize;
1215 
1216     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1217     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1218     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1219                       RHSPartRegs, RHSLeftoverRegs))
1220       return UnableToLegalize;
1221 
1222     // We now have the LHS and RHS of the compare split into narrow-type
1223     // registers, plus potentially some leftover type.
1224     Register Dst = MI.getOperand(0).getReg();
1225     LLT ResTy = MRI.getType(Dst);
1226     if (ICmpInst::isEquality(Pred)) {
1227       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1228       // them together. For each equal part, the result should be all 0s. For
1229       // each non-equal part, we'll get at least one 1.
1230       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1231       SmallVector<Register, 4> Xors;
1232       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1233         auto LHS = std::get<0>(LHSAndRHS);
1234         auto RHS = std::get<1>(LHSAndRHS);
1235         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1236         Xors.push_back(Xor);
1237       }
1238 
1239       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1240       // to the desired narrow type so that we can OR them together later.
1241       SmallVector<Register, 4> WidenedXors;
1242       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1243         auto LHS = std::get<0>(LHSAndRHS);
1244         auto RHS = std::get<1>(LHSAndRHS);
1245         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1246         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1247         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1248                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1249         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1250       }
1251 
1252       // Now, for each part we broke up, we know if they are equal/not equal
1253       // based off the G_XOR. We can OR these all together and compare against
1254       // 0 to get the result.
1255       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1256       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1257       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1258         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1259       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1260     } else {
1261       // TODO: Handle non-power-of-two types.
1262       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1263       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1264       Register LHSL = LHSPartRegs[0];
1265       Register LHSH = LHSPartRegs[1];
1266       Register RHSL = RHSPartRegs[0];
1267       Register RHSH = RHSPartRegs[1];
1268       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1269       MachineInstrBuilder CmpHEQ =
1270           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1271       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1272           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1273       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1274     }
1275     MI.eraseFromParent();
1276     return Legalized;
1277   }
1278   case TargetOpcode::G_SEXT_INREG: {
1279     if (TypeIdx != 0)
1280       return UnableToLegalize;
1281 
1282     int64_t SizeInBits = MI.getOperand(2).getImm();
1283 
1284     // So long as the new type has more bits than the bits we're extending we
1285     // don't need to break it apart.
1286     if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1287       Observer.changingInstr(MI);
1288       // We don't lose any non-extension bits by truncating the src and
1289       // sign-extending the dst.
1290       MachineOperand &MO1 = MI.getOperand(1);
1291       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1292       MO1.setReg(TruncMIB.getReg(0));
1293 
1294       MachineOperand &MO2 = MI.getOperand(0);
1295       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1296       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1297       MIRBuilder.buildSExt(MO2, DstExt);
1298       MO2.setReg(DstExt);
1299       Observer.changedInstr(MI);
1300       return Legalized;
1301     }
1302 
1303     // Break it apart. Components below the extension point are unmodified. The
1304     // component containing the extension point becomes a narrower SEXT_INREG.
1305     // Components above it are ashr'd from the component containing the
1306     // extension point.
1307     if (SizeOp0 % NarrowSize != 0)
1308       return UnableToLegalize;
1309     int NumParts = SizeOp0 / NarrowSize;
1310 
1311     // List the registers where the destination will be scattered.
1312     SmallVector<Register, 2> DstRegs;
1313     // List the registers where the source will be split.
1314     SmallVector<Register, 2> SrcRegs;
1315 
1316     // Create all the temporary registers.
1317     for (int i = 0; i < NumParts; ++i) {
1318       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1319 
1320       SrcRegs.push_back(SrcReg);
1321     }
1322 
1323     // Explode the big arguments into smaller chunks.
1324     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1325 
1326     Register AshrCstReg =
1327         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1328             .getReg(0);
1329     Register FullExtensionReg = 0;
1330     Register PartialExtensionReg = 0;
1331 
1332     // Do the operation on each small part.
1333     for (int i = 0; i < NumParts; ++i) {
1334       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1335         DstRegs.push_back(SrcRegs[i]);
1336       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1337         assert(PartialExtensionReg &&
1338                "Expected to visit partial extension before full");
1339         if (FullExtensionReg) {
1340           DstRegs.push_back(FullExtensionReg);
1341           continue;
1342         }
1343         DstRegs.push_back(
1344             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1345                 .getReg(0));
1346         FullExtensionReg = DstRegs.back();
1347       } else {
1348         DstRegs.push_back(
1349             MIRBuilder
1350                 .buildInstr(
1351                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1352                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1353                 .getReg(0));
1354         PartialExtensionReg = DstRegs.back();
1355       }
1356     }
1357 
1358     // Gather the destination registers into the final destination.
1359     Register DstReg = MI.getOperand(0).getReg();
1360     MIRBuilder.buildMerge(DstReg, DstRegs);
1361     MI.eraseFromParent();
1362     return Legalized;
1363   }
1364   case TargetOpcode::G_BSWAP:
1365   case TargetOpcode::G_BITREVERSE: {
1366     if (SizeOp0 % NarrowSize != 0)
1367       return UnableToLegalize;
1368 
1369     Observer.changingInstr(MI);
1370     SmallVector<Register, 2> SrcRegs, DstRegs;
1371     unsigned NumParts = SizeOp0 / NarrowSize;
1372     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1373 
1374     for (unsigned i = 0; i < NumParts; ++i) {
1375       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1376                                            {SrcRegs[NumParts - 1 - i]});
1377       DstRegs.push_back(DstPart.getReg(0));
1378     }
1379 
1380     MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
1381 
1382     Observer.changedInstr(MI);
1383     MI.eraseFromParent();
1384     return Legalized;
1385   }
1386   case TargetOpcode::G_PTR_ADD:
1387   case TargetOpcode::G_PTRMASK: {
1388     if (TypeIdx != 1)
1389       return UnableToLegalize;
1390     Observer.changingInstr(MI);
1391     narrowScalarSrc(MI, NarrowTy, 2);
1392     Observer.changedInstr(MI);
1393     return Legalized;
1394   }
1395   case TargetOpcode::G_FPTOUI:
1396   case TargetOpcode::G_FPTOSI:
1397     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1398   case TargetOpcode::G_FPEXT:
1399     if (TypeIdx != 0)
1400       return UnableToLegalize;
1401     Observer.changingInstr(MI);
1402     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1403     Observer.changedInstr(MI);
1404     return Legalized;
1405   }
1406 }
1407 
1408 Register LegalizerHelper::coerceToScalar(Register Val) {
1409   LLT Ty = MRI.getType(Val);
1410   if (Ty.isScalar())
1411     return Val;
1412 
1413   const DataLayout &DL = MIRBuilder.getDataLayout();
1414   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1415   if (Ty.isPointer()) {
1416     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1417       return Register();
1418     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1419   }
1420 
1421   Register NewVal = Val;
1422 
1423   assert(Ty.isVector());
1424   LLT EltTy = Ty.getElementType();
1425   if (EltTy.isPointer())
1426     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1427   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1428 }
1429 
1430 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1431                                      unsigned OpIdx, unsigned ExtOpcode) {
1432   MachineOperand &MO = MI.getOperand(OpIdx);
1433   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1434   MO.setReg(ExtB.getReg(0));
1435 }
1436 
1437 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1438                                       unsigned OpIdx) {
1439   MachineOperand &MO = MI.getOperand(OpIdx);
1440   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1441   MO.setReg(ExtB.getReg(0));
1442 }
1443 
1444 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1445                                      unsigned OpIdx, unsigned TruncOpcode) {
1446   MachineOperand &MO = MI.getOperand(OpIdx);
1447   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1448   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1449   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1450   MO.setReg(DstExt);
1451 }
1452 
1453 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1454                                       unsigned OpIdx, unsigned ExtOpcode) {
1455   MachineOperand &MO = MI.getOperand(OpIdx);
1456   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1457   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1458   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1459   MO.setReg(DstTrunc);
1460 }
1461 
1462 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1463                                             unsigned OpIdx) {
1464   MachineOperand &MO = MI.getOperand(OpIdx);
1465   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1466   Register Dst = MO.getReg();
1467   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1468   MO.setReg(DstExt);
1469   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1470 }
1471 
1472 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1473                                             unsigned OpIdx) {
1474   MachineOperand &MO = MI.getOperand(OpIdx);
1475   SmallVector<Register, 8> Regs;
1476   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1477 }
1478 
1479 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1480   MachineOperand &Op = MI.getOperand(OpIdx);
1481   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1482 }
1483 
1484 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1485   MachineOperand &MO = MI.getOperand(OpIdx);
1486   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1487   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1488   MIRBuilder.buildBitcast(MO, CastDst);
1489   MO.setReg(CastDst);
1490 }
1491 
1492 LegalizerHelper::LegalizeResult
1493 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1494                                         LLT WideTy) {
1495   if (TypeIdx != 1)
1496     return UnableToLegalize;
1497 
1498   Register DstReg = MI.getOperand(0).getReg();
1499   LLT DstTy = MRI.getType(DstReg);
1500   if (DstTy.isVector())
1501     return UnableToLegalize;
1502 
1503   Register Src1 = MI.getOperand(1).getReg();
1504   LLT SrcTy = MRI.getType(Src1);
1505   const int DstSize = DstTy.getSizeInBits();
1506   const int SrcSize = SrcTy.getSizeInBits();
1507   const int WideSize = WideTy.getSizeInBits();
1508   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1509 
1510   unsigned NumOps = MI.getNumOperands();
1511   unsigned NumSrc = MI.getNumOperands() - 1;
1512   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1513 
1514   if (WideSize >= DstSize) {
1515     // Directly pack the bits in the target type.
1516     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1).getReg(0);
1517 
1518     for (unsigned I = 2; I != NumOps; ++I) {
1519       const unsigned Offset = (I - 1) * PartSize;
1520 
1521       Register SrcReg = MI.getOperand(I).getReg();
1522       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1523 
1524       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1525 
1526       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1527         MRI.createGenericVirtualRegister(WideTy);
1528 
1529       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1530       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1531       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1532       ResultReg = NextResult;
1533     }
1534 
1535     if (WideSize > DstSize)
1536       MIRBuilder.buildTrunc(DstReg, ResultReg);
1537     else if (DstTy.isPointer())
1538       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1539 
1540     MI.eraseFromParent();
1541     return Legalized;
1542   }
1543 
1544   // Unmerge the original values to the GCD type, and recombine to the next
1545   // multiple greater than the original type.
1546   //
1547   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1548   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1549   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1550   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1551   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1552   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1553   // %12:_(s12) = G_MERGE_VALUES %10, %11
1554   //
1555   // Padding with undef if necessary:
1556   //
1557   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1558   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1559   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1560   // %7:_(s2) = G_IMPLICIT_DEF
1561   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1562   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1563   // %10:_(s12) = G_MERGE_VALUES %8, %9
1564 
1565   const int GCD = greatestCommonDivisor(SrcSize, WideSize);
1566   LLT GCDTy = LLT::scalar(GCD);
1567 
1568   SmallVector<Register, 8> Parts;
1569   SmallVector<Register, 8> NewMergeRegs;
1570   SmallVector<Register, 8> Unmerges;
1571   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1572 
1573   // Decompose the original operands if they don't evenly divide.
1574   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1575     Register SrcReg = MO.getReg();
1576     if (GCD == SrcSize) {
1577       Unmerges.push_back(SrcReg);
1578     } else {
1579       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1580       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1581         Unmerges.push_back(Unmerge.getReg(J));
1582     }
1583   }
1584 
1585   // Pad with undef to the next size that is a multiple of the requested size.
1586   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1587     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1588     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1589       Unmerges.push_back(UndefReg);
1590   }
1591 
1592   const int PartsPerGCD = WideSize / GCD;
1593 
1594   // Build merges of each piece.
1595   ArrayRef<Register> Slicer(Unmerges);
1596   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1597     auto Merge = MIRBuilder.buildMerge(WideTy, Slicer.take_front(PartsPerGCD));
1598     NewMergeRegs.push_back(Merge.getReg(0));
1599   }
1600 
1601   // A truncate may be necessary if the requested type doesn't evenly divide the
1602   // original result type.
1603   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1604     MIRBuilder.buildMerge(DstReg, NewMergeRegs);
1605   } else {
1606     auto FinalMerge = MIRBuilder.buildMerge(WideDstTy, NewMergeRegs);
1607     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1608   }
1609 
1610   MI.eraseFromParent();
1611   return Legalized;
1612 }
1613 
1614 Register LegalizerHelper::widenWithUnmerge(LLT WideTy, Register OrigReg) {
1615   Register WideReg = MRI.createGenericVirtualRegister(WideTy);
1616   LLT OrigTy = MRI.getType(OrigReg);
1617   LLT LCMTy = getLCMType(WideTy, OrigTy);
1618 
1619   const int NumMergeParts = LCMTy.getSizeInBits() / WideTy.getSizeInBits();
1620   const int NumUnmergeParts = LCMTy.getSizeInBits() / OrigTy.getSizeInBits();
1621 
1622   Register UnmergeSrc = WideReg;
1623 
1624   // Create a merge to the LCM type, padding with undef
1625   // %0:_(<3 x s32>) = G_FOO => <4 x s32>
1626   // =>
1627   // %1:_(<4 x s32>) = G_FOO
1628   // %2:_(<4 x s32>) = G_IMPLICIT_DEF
1629   // %3:_(<12 x s32>) = G_CONCAT_VECTORS %1, %2, %2
1630   // %0:_(<3 x s32>), %4:_, %5:_, %6:_ = G_UNMERGE_VALUES %3
1631   if (NumMergeParts > 1) {
1632     Register Undef = MIRBuilder.buildUndef(WideTy).getReg(0);
1633     SmallVector<Register, 8> MergeParts(NumMergeParts, Undef);
1634     MergeParts[0] = WideReg;
1635     UnmergeSrc = MIRBuilder.buildMerge(LCMTy, MergeParts).getReg(0);
1636   }
1637 
1638   // Unmerge to the original register and pad with dead defs.
1639   SmallVector<Register, 8> UnmergeResults(NumUnmergeParts);
1640   UnmergeResults[0] = OrigReg;
1641   for (int I = 1; I != NumUnmergeParts; ++I)
1642     UnmergeResults[I] = MRI.createGenericVirtualRegister(OrigTy);
1643 
1644   MIRBuilder.buildUnmerge(UnmergeResults, UnmergeSrc);
1645   return WideReg;
1646 }
1647 
1648 LegalizerHelper::LegalizeResult
1649 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1650                                           LLT WideTy) {
1651   if (TypeIdx != 0)
1652     return UnableToLegalize;
1653 
1654   int NumDst = MI.getNumOperands() - 1;
1655   Register SrcReg = MI.getOperand(NumDst).getReg();
1656   LLT SrcTy = MRI.getType(SrcReg);
1657   if (SrcTy.isVector())
1658     return UnableToLegalize;
1659 
1660   Register Dst0Reg = MI.getOperand(0).getReg();
1661   LLT DstTy = MRI.getType(Dst0Reg);
1662   if (!DstTy.isScalar())
1663     return UnableToLegalize;
1664 
1665   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1666     if (SrcTy.isPointer()) {
1667       const DataLayout &DL = MIRBuilder.getDataLayout();
1668       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1669         LLVM_DEBUG(
1670             dbgs() << "Not casting non-integral address space integer\n");
1671         return UnableToLegalize;
1672       }
1673 
1674       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1675       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1676     }
1677 
1678     // Widen SrcTy to WideTy. This does not affect the result, but since the
1679     // user requested this size, it is probably better handled than SrcTy and
1680     // should reduce the total number of legalization artifacts.
1681     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1682       SrcTy = WideTy;
1683       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1684     }
1685 
1686     // Theres no unmerge type to target. Directly extract the bits from the
1687     // source type
1688     unsigned DstSize = DstTy.getSizeInBits();
1689 
1690     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1691     for (int I = 1; I != NumDst; ++I) {
1692       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1693       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1694       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1695     }
1696 
1697     MI.eraseFromParent();
1698     return Legalized;
1699   }
1700 
1701   // Extend the source to a wider type.
1702   LLT LCMTy = getLCMType(SrcTy, WideTy);
1703 
1704   Register WideSrc = SrcReg;
1705   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1706     // TODO: If this is an integral address space, cast to integer and anyext.
1707     if (SrcTy.isPointer()) {
1708       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1709       return UnableToLegalize;
1710     }
1711 
1712     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1713   }
1714 
1715   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1716 
1717   // Create a sequence of unmerges and merges to the original results. Since we
1718   // may have widened the source, we will need to pad the results with dead defs
1719   // to cover the source register.
1720   // e.g. widen s48 to s64:
1721   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1722   //
1723   // =>
1724   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1725   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1726   //  ; unpack to GCD type, with extra dead defs
1727   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1728   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1729   //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
1730   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1731   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1732   const LLT GCDTy = getGCDType(WideTy, DstTy);
1733   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1734   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1735 
1736   // Directly unmerge to the destination without going through a GCD type
1737   // if possible
1738   if (PartsPerRemerge == 1) {
1739     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1740 
1741     for (int I = 0; I != NumUnmerge; ++I) {
1742       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1743 
1744       for (int J = 0; J != PartsPerUnmerge; ++J) {
1745         int Idx = I * PartsPerUnmerge + J;
1746         if (Idx < NumDst)
1747           MIB.addDef(MI.getOperand(Idx).getReg());
1748         else {
1749           // Create dead def for excess components.
1750           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1751         }
1752       }
1753 
1754       MIB.addUse(Unmerge.getReg(I));
1755     }
1756   } else {
1757     SmallVector<Register, 16> Parts;
1758     for (int J = 0; J != NumUnmerge; ++J)
1759       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1760 
1761     SmallVector<Register, 8> RemergeParts;
1762     for (int I = 0; I != NumDst; ++I) {
1763       for (int J = 0; J < PartsPerRemerge; ++J) {
1764         const int Idx = I * PartsPerRemerge + J;
1765         RemergeParts.emplace_back(Parts[Idx]);
1766       }
1767 
1768       MIRBuilder.buildMerge(MI.getOperand(I).getReg(), RemergeParts);
1769       RemergeParts.clear();
1770     }
1771   }
1772 
1773   MI.eraseFromParent();
1774   return Legalized;
1775 }
1776 
1777 LegalizerHelper::LegalizeResult
1778 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1779                                     LLT WideTy) {
1780   Register DstReg = MI.getOperand(0).getReg();
1781   Register SrcReg = MI.getOperand(1).getReg();
1782   LLT SrcTy = MRI.getType(SrcReg);
1783 
1784   LLT DstTy = MRI.getType(DstReg);
1785   unsigned Offset = MI.getOperand(2).getImm();
1786 
1787   if (TypeIdx == 0) {
1788     if (SrcTy.isVector() || DstTy.isVector())
1789       return UnableToLegalize;
1790 
1791     SrcOp Src(SrcReg);
1792     if (SrcTy.isPointer()) {
1793       // Extracts from pointers can be handled only if they are really just
1794       // simple integers.
1795       const DataLayout &DL = MIRBuilder.getDataLayout();
1796       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1797         return UnableToLegalize;
1798 
1799       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1800       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1801       SrcTy = SrcAsIntTy;
1802     }
1803 
1804     if (DstTy.isPointer())
1805       return UnableToLegalize;
1806 
1807     if (Offset == 0) {
1808       // Avoid a shift in the degenerate case.
1809       MIRBuilder.buildTrunc(DstReg,
1810                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1811       MI.eraseFromParent();
1812       return Legalized;
1813     }
1814 
1815     // Do a shift in the source type.
1816     LLT ShiftTy = SrcTy;
1817     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1818       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1819       ShiftTy = WideTy;
1820     }
1821 
1822     auto LShr = MIRBuilder.buildLShr(
1823       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1824     MIRBuilder.buildTrunc(DstReg, LShr);
1825     MI.eraseFromParent();
1826     return Legalized;
1827   }
1828 
1829   if (SrcTy.isScalar()) {
1830     Observer.changingInstr(MI);
1831     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1832     Observer.changedInstr(MI);
1833     return Legalized;
1834   }
1835 
1836   if (!SrcTy.isVector())
1837     return UnableToLegalize;
1838 
1839   if (DstTy != SrcTy.getElementType())
1840     return UnableToLegalize;
1841 
1842   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1843     return UnableToLegalize;
1844 
1845   Observer.changingInstr(MI);
1846   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1847 
1848   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1849                           Offset);
1850   widenScalarDst(MI, WideTy.getScalarType(), 0);
1851   Observer.changedInstr(MI);
1852   return Legalized;
1853 }
1854 
1855 LegalizerHelper::LegalizeResult
1856 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1857                                    LLT WideTy) {
1858   if (TypeIdx != 0 || WideTy.isVector())
1859     return UnableToLegalize;
1860   Observer.changingInstr(MI);
1861   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1862   widenScalarDst(MI, WideTy);
1863   Observer.changedInstr(MI);
1864   return Legalized;
1865 }
1866 
1867 LegalizerHelper::LegalizeResult
1868 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1869                                            LLT WideTy) {
1870   if (TypeIdx == 1)
1871     return UnableToLegalize; // TODO
1872 
1873   unsigned Opcode;
1874   unsigned ExtOpcode;
1875   Optional<Register> CarryIn = None;
1876   switch (MI.getOpcode()) {
1877   default:
1878     llvm_unreachable("Unexpected opcode!");
1879   case TargetOpcode::G_SADDO:
1880     Opcode = TargetOpcode::G_ADD;
1881     ExtOpcode = TargetOpcode::G_SEXT;
1882     break;
1883   case TargetOpcode::G_SSUBO:
1884     Opcode = TargetOpcode::G_SUB;
1885     ExtOpcode = TargetOpcode::G_SEXT;
1886     break;
1887   case TargetOpcode::G_UADDO:
1888     Opcode = TargetOpcode::G_ADD;
1889     ExtOpcode = TargetOpcode::G_ZEXT;
1890     break;
1891   case TargetOpcode::G_USUBO:
1892     Opcode = TargetOpcode::G_SUB;
1893     ExtOpcode = TargetOpcode::G_ZEXT;
1894     break;
1895   case TargetOpcode::G_SADDE:
1896     Opcode = TargetOpcode::G_UADDE;
1897     ExtOpcode = TargetOpcode::G_SEXT;
1898     CarryIn = MI.getOperand(4).getReg();
1899     break;
1900   case TargetOpcode::G_SSUBE:
1901     Opcode = TargetOpcode::G_USUBE;
1902     ExtOpcode = TargetOpcode::G_SEXT;
1903     CarryIn = MI.getOperand(4).getReg();
1904     break;
1905   case TargetOpcode::G_UADDE:
1906     Opcode = TargetOpcode::G_UADDE;
1907     ExtOpcode = TargetOpcode::G_ZEXT;
1908     CarryIn = MI.getOperand(4).getReg();
1909     break;
1910   case TargetOpcode::G_USUBE:
1911     Opcode = TargetOpcode::G_USUBE;
1912     ExtOpcode = TargetOpcode::G_ZEXT;
1913     CarryIn = MI.getOperand(4).getReg();
1914     break;
1915   }
1916 
1917   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1918   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1919   // Do the arithmetic in the larger type.
1920   Register NewOp;
1921   if (CarryIn) {
1922     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1923     NewOp = MIRBuilder
1924                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1925                             {LHSExt, RHSExt, *CarryIn})
1926                 .getReg(0);
1927   } else {
1928     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1929   }
1930   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1931   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1932   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1933   // There is no overflow if the ExtOp is the same as NewOp.
1934   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1935   // Now trunc the NewOp to the original result.
1936   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1937   MI.eraseFromParent();
1938   return Legalized;
1939 }
1940 
1941 LegalizerHelper::LegalizeResult
1942 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1943                                          LLT WideTy) {
1944   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1945                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1946                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1947   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1948                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1949   // We can convert this to:
1950   //   1. Any extend iN to iM
1951   //   2. SHL by M-N
1952   //   3. [US][ADD|SUB|SHL]SAT
1953   //   4. L/ASHR by M-N
1954   //
1955   // It may be more efficient to lower this to a min and a max operation in
1956   // the higher precision arithmetic if the promoted operation isn't legal,
1957   // but this decision is up to the target's lowering request.
1958   Register DstReg = MI.getOperand(0).getReg();
1959 
1960   unsigned NewBits = WideTy.getScalarSizeInBits();
1961   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1962 
1963   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1964   // must not left shift the RHS to preserve the shift amount.
1965   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1966   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1967                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1968   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1969   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1970   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1971 
1972   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1973                                         {ShiftL, ShiftR}, MI.getFlags());
1974 
1975   // Use a shift that will preserve the number of sign bits when the trunc is
1976   // folded away.
1977   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1978                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1979 
1980   MIRBuilder.buildTrunc(DstReg, Result);
1981   MI.eraseFromParent();
1982   return Legalized;
1983 }
1984 
1985 LegalizerHelper::LegalizeResult
1986 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1987                                  LLT WideTy) {
1988   if (TypeIdx == 1)
1989     return UnableToLegalize;
1990 
1991   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1992   Register Result = MI.getOperand(0).getReg();
1993   Register OriginalOverflow = MI.getOperand(1).getReg();
1994   Register LHS = MI.getOperand(2).getReg();
1995   Register RHS = MI.getOperand(3).getReg();
1996   LLT SrcTy = MRI.getType(LHS);
1997   LLT OverflowTy = MRI.getType(OriginalOverflow);
1998   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1999 
2000   // To determine if the result overflowed in the larger type, we extend the
2001   // input to the larger type, do the multiply (checking if it overflows),
2002   // then also check the high bits of the result to see if overflow happened
2003   // there.
2004   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2005   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2006   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2007 
2008   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
2009                                     {LeftOperand, RightOperand});
2010   auto Mul = Mulo->getOperand(0);
2011   MIRBuilder.buildTrunc(Result, Mul);
2012 
2013   MachineInstrBuilder ExtResult;
2014   // Overflow occurred if it occurred in the larger type, or if the high part
2015   // of the result does not zero/sign-extend the low part.  Check this second
2016   // possibility first.
2017   if (IsSigned) {
2018     // For signed, overflow occurred when the high part does not sign-extend
2019     // the low part.
2020     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2021   } else {
2022     // Unsigned overflow occurred when the high part does not zero-extend the
2023     // low part.
2024     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2025   }
2026 
2027   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2028   // so we don't need to check the overflow result of larger type Mulo.
2029   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
2030     auto Overflow =
2031         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2032     // Finally check if the multiplication in the larger type itself overflowed.
2033     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2034   } else {
2035     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2036   }
2037   MI.eraseFromParent();
2038   return Legalized;
2039 }
2040 
2041 LegalizerHelper::LegalizeResult
2042 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2043   switch (MI.getOpcode()) {
2044   default:
2045     return UnableToLegalize;
2046   case TargetOpcode::G_ATOMICRMW_XCHG:
2047   case TargetOpcode::G_ATOMICRMW_ADD:
2048   case TargetOpcode::G_ATOMICRMW_SUB:
2049   case TargetOpcode::G_ATOMICRMW_AND:
2050   case TargetOpcode::G_ATOMICRMW_OR:
2051   case TargetOpcode::G_ATOMICRMW_XOR:
2052   case TargetOpcode::G_ATOMICRMW_MIN:
2053   case TargetOpcode::G_ATOMICRMW_MAX:
2054   case TargetOpcode::G_ATOMICRMW_UMIN:
2055   case TargetOpcode::G_ATOMICRMW_UMAX:
2056     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2057     Observer.changingInstr(MI);
2058     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2059     widenScalarDst(MI, WideTy, 0);
2060     Observer.changedInstr(MI);
2061     return Legalized;
2062   case TargetOpcode::G_ATOMIC_CMPXCHG:
2063     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2064     Observer.changingInstr(MI);
2065     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2066     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2067     widenScalarDst(MI, WideTy, 0);
2068     Observer.changedInstr(MI);
2069     return Legalized;
2070   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2071     if (TypeIdx == 0) {
2072       Observer.changingInstr(MI);
2073       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2074       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2075       widenScalarDst(MI, WideTy, 0);
2076       Observer.changedInstr(MI);
2077       return Legalized;
2078     }
2079     assert(TypeIdx == 1 &&
2080            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2081     Observer.changingInstr(MI);
2082     widenScalarDst(MI, WideTy, 1);
2083     Observer.changedInstr(MI);
2084     return Legalized;
2085   case TargetOpcode::G_EXTRACT:
2086     return widenScalarExtract(MI, TypeIdx, WideTy);
2087   case TargetOpcode::G_INSERT:
2088     return widenScalarInsert(MI, TypeIdx, WideTy);
2089   case TargetOpcode::G_MERGE_VALUES:
2090     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2091   case TargetOpcode::G_UNMERGE_VALUES:
2092     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2093   case TargetOpcode::G_SADDO:
2094   case TargetOpcode::G_SSUBO:
2095   case TargetOpcode::G_UADDO:
2096   case TargetOpcode::G_USUBO:
2097   case TargetOpcode::G_SADDE:
2098   case TargetOpcode::G_SSUBE:
2099   case TargetOpcode::G_UADDE:
2100   case TargetOpcode::G_USUBE:
2101     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2102   case TargetOpcode::G_UMULO:
2103   case TargetOpcode::G_SMULO:
2104     return widenScalarMulo(MI, TypeIdx, WideTy);
2105   case TargetOpcode::G_SADDSAT:
2106   case TargetOpcode::G_SSUBSAT:
2107   case TargetOpcode::G_SSHLSAT:
2108   case TargetOpcode::G_UADDSAT:
2109   case TargetOpcode::G_USUBSAT:
2110   case TargetOpcode::G_USHLSAT:
2111     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2112   case TargetOpcode::G_CTTZ:
2113   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2114   case TargetOpcode::G_CTLZ:
2115   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2116   case TargetOpcode::G_CTPOP: {
2117     if (TypeIdx == 0) {
2118       Observer.changingInstr(MI);
2119       widenScalarDst(MI, WideTy, 0);
2120       Observer.changedInstr(MI);
2121       return Legalized;
2122     }
2123 
2124     Register SrcReg = MI.getOperand(1).getReg();
2125 
2126     // First extend the input.
2127     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2128                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2129                           ? TargetOpcode::G_ANYEXT
2130                           : TargetOpcode::G_ZEXT;
2131     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2132     LLT CurTy = MRI.getType(SrcReg);
2133     unsigned NewOpc = MI.getOpcode();
2134     if (NewOpc == TargetOpcode::G_CTTZ) {
2135       // The count is the same in the larger type except if the original
2136       // value was zero.  This can be handled by setting the bit just off
2137       // the top of the original type.
2138       auto TopBit =
2139           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2140       MIBSrc = MIRBuilder.buildOr(
2141         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2142       // Now we know the operand is non-zero, use the more relaxed opcode.
2143       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2144     }
2145 
2146     // Perform the operation at the larger size.
2147     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2148     // This is already the correct result for CTPOP and CTTZs
2149     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2150         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2151       // The correct result is NewOp - (Difference in widety and current ty).
2152       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2153       MIBNewOp = MIRBuilder.buildSub(
2154           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2155     }
2156 
2157     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2158     MI.eraseFromParent();
2159     return Legalized;
2160   }
2161   case TargetOpcode::G_BSWAP: {
2162     Observer.changingInstr(MI);
2163     Register DstReg = MI.getOperand(0).getReg();
2164 
2165     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2166     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2167     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2168     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2169 
2170     MI.getOperand(0).setReg(DstExt);
2171 
2172     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2173 
2174     LLT Ty = MRI.getType(DstReg);
2175     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2176     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2177     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2178 
2179     MIRBuilder.buildTrunc(DstReg, ShrReg);
2180     Observer.changedInstr(MI);
2181     return Legalized;
2182   }
2183   case TargetOpcode::G_BITREVERSE: {
2184     Observer.changingInstr(MI);
2185 
2186     Register DstReg = MI.getOperand(0).getReg();
2187     LLT Ty = MRI.getType(DstReg);
2188     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2189 
2190     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2191     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2192     MI.getOperand(0).setReg(DstExt);
2193     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2194 
2195     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2196     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2197     MIRBuilder.buildTrunc(DstReg, Shift);
2198     Observer.changedInstr(MI);
2199     return Legalized;
2200   }
2201   case TargetOpcode::G_FREEZE:
2202     Observer.changingInstr(MI);
2203     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2204     widenScalarDst(MI, WideTy);
2205     Observer.changedInstr(MI);
2206     return Legalized;
2207 
2208   case TargetOpcode::G_ABS:
2209     Observer.changingInstr(MI);
2210     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2211     widenScalarDst(MI, WideTy);
2212     Observer.changedInstr(MI);
2213     return Legalized;
2214 
2215   case TargetOpcode::G_ADD:
2216   case TargetOpcode::G_AND:
2217   case TargetOpcode::G_MUL:
2218   case TargetOpcode::G_OR:
2219   case TargetOpcode::G_XOR:
2220   case TargetOpcode::G_SUB:
2221     // Perform operation at larger width (any extension is fines here, high bits
2222     // don't affect the result) and then truncate the result back to the
2223     // original type.
2224     Observer.changingInstr(MI);
2225     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2226     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2227     widenScalarDst(MI, WideTy);
2228     Observer.changedInstr(MI);
2229     return Legalized;
2230 
2231   case TargetOpcode::G_SBFX:
2232   case TargetOpcode::G_UBFX:
2233     Observer.changingInstr(MI);
2234 
2235     if (TypeIdx == 0) {
2236       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2237       widenScalarDst(MI, WideTy);
2238     } else {
2239       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2240       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2241     }
2242 
2243     Observer.changedInstr(MI);
2244     return Legalized;
2245 
2246   case TargetOpcode::G_SHL:
2247     Observer.changingInstr(MI);
2248 
2249     if (TypeIdx == 0) {
2250       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2251       widenScalarDst(MI, WideTy);
2252     } else {
2253       assert(TypeIdx == 1);
2254       // The "number of bits to shift" operand must preserve its value as an
2255       // unsigned integer:
2256       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2257     }
2258 
2259     Observer.changedInstr(MI);
2260     return Legalized;
2261 
2262   case TargetOpcode::G_SDIV:
2263   case TargetOpcode::G_SREM:
2264   case TargetOpcode::G_SMIN:
2265   case TargetOpcode::G_SMAX:
2266     Observer.changingInstr(MI);
2267     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2268     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2269     widenScalarDst(MI, WideTy);
2270     Observer.changedInstr(MI);
2271     return Legalized;
2272 
2273   case TargetOpcode::G_SDIVREM:
2274     Observer.changingInstr(MI);
2275     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2276     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2277     widenScalarDst(MI, WideTy);
2278     widenScalarDst(MI, WideTy, 1);
2279     Observer.changedInstr(MI);
2280     return Legalized;
2281 
2282   case TargetOpcode::G_ASHR:
2283   case TargetOpcode::G_LSHR:
2284     Observer.changingInstr(MI);
2285 
2286     if (TypeIdx == 0) {
2287       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2288         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2289 
2290       widenScalarSrc(MI, WideTy, 1, CvtOp);
2291       widenScalarDst(MI, WideTy);
2292     } else {
2293       assert(TypeIdx == 1);
2294       // The "number of bits to shift" operand must preserve its value as an
2295       // unsigned integer:
2296       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2297     }
2298 
2299     Observer.changedInstr(MI);
2300     return Legalized;
2301   case TargetOpcode::G_UDIV:
2302   case TargetOpcode::G_UREM:
2303   case TargetOpcode::G_UMIN:
2304   case TargetOpcode::G_UMAX:
2305     Observer.changingInstr(MI);
2306     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2307     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2308     widenScalarDst(MI, WideTy);
2309     Observer.changedInstr(MI);
2310     return Legalized;
2311 
2312   case TargetOpcode::G_UDIVREM:
2313     Observer.changingInstr(MI);
2314     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2315     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2316     widenScalarDst(MI, WideTy);
2317     widenScalarDst(MI, WideTy, 1);
2318     Observer.changedInstr(MI);
2319     return Legalized;
2320 
2321   case TargetOpcode::G_SELECT:
2322     Observer.changingInstr(MI);
2323     if (TypeIdx == 0) {
2324       // Perform operation at larger width (any extension is fine here, high
2325       // bits don't affect the result) and then truncate the result back to the
2326       // original type.
2327       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2328       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2329       widenScalarDst(MI, WideTy);
2330     } else {
2331       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2332       // Explicit extension is required here since high bits affect the result.
2333       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2334     }
2335     Observer.changedInstr(MI);
2336     return Legalized;
2337 
2338   case TargetOpcode::G_FPTOSI:
2339   case TargetOpcode::G_FPTOUI:
2340     Observer.changingInstr(MI);
2341 
2342     if (TypeIdx == 0)
2343       widenScalarDst(MI, WideTy);
2344     else
2345       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2346 
2347     Observer.changedInstr(MI);
2348     return Legalized;
2349   case TargetOpcode::G_SITOFP:
2350     Observer.changingInstr(MI);
2351 
2352     if (TypeIdx == 0)
2353       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2354     else
2355       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2356 
2357     Observer.changedInstr(MI);
2358     return Legalized;
2359   case TargetOpcode::G_UITOFP:
2360     Observer.changingInstr(MI);
2361 
2362     if (TypeIdx == 0)
2363       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2364     else
2365       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2366 
2367     Observer.changedInstr(MI);
2368     return Legalized;
2369   case TargetOpcode::G_LOAD:
2370   case TargetOpcode::G_SEXTLOAD:
2371   case TargetOpcode::G_ZEXTLOAD:
2372     Observer.changingInstr(MI);
2373     widenScalarDst(MI, WideTy);
2374     Observer.changedInstr(MI);
2375     return Legalized;
2376 
2377   case TargetOpcode::G_STORE: {
2378     if (TypeIdx != 0)
2379       return UnableToLegalize;
2380 
2381     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2382     if (!Ty.isScalar())
2383       return UnableToLegalize;
2384 
2385     Observer.changingInstr(MI);
2386 
2387     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2388       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2389     widenScalarSrc(MI, WideTy, 0, ExtType);
2390 
2391     Observer.changedInstr(MI);
2392     return Legalized;
2393   }
2394   case TargetOpcode::G_CONSTANT: {
2395     MachineOperand &SrcMO = MI.getOperand(1);
2396     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2397     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2398         MRI.getType(MI.getOperand(0).getReg()));
2399     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2400             ExtOpc == TargetOpcode::G_ANYEXT) &&
2401            "Illegal Extend");
2402     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2403     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2404                            ? SrcVal.sext(WideTy.getSizeInBits())
2405                            : SrcVal.zext(WideTy.getSizeInBits());
2406     Observer.changingInstr(MI);
2407     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2408 
2409     widenScalarDst(MI, WideTy);
2410     Observer.changedInstr(MI);
2411     return Legalized;
2412   }
2413   case TargetOpcode::G_FCONSTANT: {
2414     MachineOperand &SrcMO = MI.getOperand(1);
2415     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2416     APFloat Val = SrcMO.getFPImm()->getValueAPF();
2417     bool LosesInfo;
2418     switch (WideTy.getSizeInBits()) {
2419     case 32:
2420       Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
2421                   &LosesInfo);
2422       break;
2423     case 64:
2424       Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
2425                   &LosesInfo);
2426       break;
2427     default:
2428       return UnableToLegalize;
2429     }
2430 
2431     assert(!LosesInfo && "extend should always be lossless");
2432 
2433     Observer.changingInstr(MI);
2434     SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
2435 
2436     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2437     Observer.changedInstr(MI);
2438     return Legalized;
2439   }
2440   case TargetOpcode::G_IMPLICIT_DEF: {
2441     Observer.changingInstr(MI);
2442     widenScalarDst(MI, WideTy);
2443     Observer.changedInstr(MI);
2444     return Legalized;
2445   }
2446   case TargetOpcode::G_BRCOND:
2447     Observer.changingInstr(MI);
2448     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2449     Observer.changedInstr(MI);
2450     return Legalized;
2451 
2452   case TargetOpcode::G_FCMP:
2453     Observer.changingInstr(MI);
2454     if (TypeIdx == 0)
2455       widenScalarDst(MI, WideTy);
2456     else {
2457       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2458       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2459     }
2460     Observer.changedInstr(MI);
2461     return Legalized;
2462 
2463   case TargetOpcode::G_ICMP:
2464     Observer.changingInstr(MI);
2465     if (TypeIdx == 0)
2466       widenScalarDst(MI, WideTy);
2467     else {
2468       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2469                                MI.getOperand(1).getPredicate()))
2470                                ? TargetOpcode::G_SEXT
2471                                : TargetOpcode::G_ZEXT;
2472       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2473       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2474     }
2475     Observer.changedInstr(MI);
2476     return Legalized;
2477 
2478   case TargetOpcode::G_PTR_ADD:
2479     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2480     Observer.changingInstr(MI);
2481     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2482     Observer.changedInstr(MI);
2483     return Legalized;
2484 
2485   case TargetOpcode::G_PHI: {
2486     assert(TypeIdx == 0 && "Expecting only Idx 0");
2487 
2488     Observer.changingInstr(MI);
2489     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2490       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2491       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
2492       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2493     }
2494 
2495     MachineBasicBlock &MBB = *MI.getParent();
2496     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2497     widenScalarDst(MI, WideTy);
2498     Observer.changedInstr(MI);
2499     return Legalized;
2500   }
2501   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2502     if (TypeIdx == 0) {
2503       Register VecReg = MI.getOperand(1).getReg();
2504       LLT VecTy = MRI.getType(VecReg);
2505       Observer.changingInstr(MI);
2506 
2507       widenScalarSrc(
2508           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2509           TargetOpcode::G_ANYEXT);
2510 
2511       widenScalarDst(MI, WideTy, 0);
2512       Observer.changedInstr(MI);
2513       return Legalized;
2514     }
2515 
2516     if (TypeIdx != 2)
2517       return UnableToLegalize;
2518     Observer.changingInstr(MI);
2519     // TODO: Probably should be zext
2520     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2521     Observer.changedInstr(MI);
2522     return Legalized;
2523   }
2524   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2525     if (TypeIdx == 1) {
2526       Observer.changingInstr(MI);
2527 
2528       Register VecReg = MI.getOperand(1).getReg();
2529       LLT VecTy = MRI.getType(VecReg);
2530       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2531 
2532       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2533       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2534       widenScalarDst(MI, WideVecTy, 0);
2535       Observer.changedInstr(MI);
2536       return Legalized;
2537     }
2538 
2539     if (TypeIdx == 2) {
2540       Observer.changingInstr(MI);
2541       // TODO: Probably should be zext
2542       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2543       Observer.changedInstr(MI);
2544       return Legalized;
2545     }
2546 
2547     return UnableToLegalize;
2548   }
2549   case TargetOpcode::G_FADD:
2550   case TargetOpcode::G_FMUL:
2551   case TargetOpcode::G_FSUB:
2552   case TargetOpcode::G_FMA:
2553   case TargetOpcode::G_FMAD:
2554   case TargetOpcode::G_FNEG:
2555   case TargetOpcode::G_FABS:
2556   case TargetOpcode::G_FCANONICALIZE:
2557   case TargetOpcode::G_FMINNUM:
2558   case TargetOpcode::G_FMAXNUM:
2559   case TargetOpcode::G_FMINNUM_IEEE:
2560   case TargetOpcode::G_FMAXNUM_IEEE:
2561   case TargetOpcode::G_FMINIMUM:
2562   case TargetOpcode::G_FMAXIMUM:
2563   case TargetOpcode::G_FDIV:
2564   case TargetOpcode::G_FREM:
2565   case TargetOpcode::G_FCEIL:
2566   case TargetOpcode::G_FFLOOR:
2567   case TargetOpcode::G_FCOS:
2568   case TargetOpcode::G_FSIN:
2569   case TargetOpcode::G_FLOG10:
2570   case TargetOpcode::G_FLOG:
2571   case TargetOpcode::G_FLOG2:
2572   case TargetOpcode::G_FRINT:
2573   case TargetOpcode::G_FNEARBYINT:
2574   case TargetOpcode::G_FSQRT:
2575   case TargetOpcode::G_FEXP:
2576   case TargetOpcode::G_FEXP2:
2577   case TargetOpcode::G_FPOW:
2578   case TargetOpcode::G_INTRINSIC_TRUNC:
2579   case TargetOpcode::G_INTRINSIC_ROUND:
2580   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2581     assert(TypeIdx == 0);
2582     Observer.changingInstr(MI);
2583 
2584     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2585       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2586 
2587     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2588     Observer.changedInstr(MI);
2589     return Legalized;
2590   case TargetOpcode::G_FPOWI: {
2591     if (TypeIdx != 0)
2592       return UnableToLegalize;
2593     Observer.changingInstr(MI);
2594     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2595     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2596     Observer.changedInstr(MI);
2597     return Legalized;
2598   }
2599   case TargetOpcode::G_INTTOPTR:
2600     if (TypeIdx != 1)
2601       return UnableToLegalize;
2602 
2603     Observer.changingInstr(MI);
2604     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2605     Observer.changedInstr(MI);
2606     return Legalized;
2607   case TargetOpcode::G_PTRTOINT:
2608     if (TypeIdx != 0)
2609       return UnableToLegalize;
2610 
2611     Observer.changingInstr(MI);
2612     widenScalarDst(MI, WideTy, 0);
2613     Observer.changedInstr(MI);
2614     return Legalized;
2615   case TargetOpcode::G_BUILD_VECTOR: {
2616     Observer.changingInstr(MI);
2617 
2618     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2619     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2620       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2621 
2622     // Avoid changing the result vector type if the source element type was
2623     // requested.
2624     if (TypeIdx == 1) {
2625       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2626     } else {
2627       widenScalarDst(MI, WideTy, 0);
2628     }
2629 
2630     Observer.changedInstr(MI);
2631     return Legalized;
2632   }
2633   case TargetOpcode::G_SEXT_INREG:
2634     if (TypeIdx != 0)
2635       return UnableToLegalize;
2636 
2637     Observer.changingInstr(MI);
2638     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2639     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2640     Observer.changedInstr(MI);
2641     return Legalized;
2642   case TargetOpcode::G_PTRMASK: {
2643     if (TypeIdx != 1)
2644       return UnableToLegalize;
2645     Observer.changingInstr(MI);
2646     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2647     Observer.changedInstr(MI);
2648     return Legalized;
2649   }
2650   }
2651 }
2652 
2653 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2654                              MachineIRBuilder &B, Register Src, LLT Ty) {
2655   auto Unmerge = B.buildUnmerge(Ty, Src);
2656   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2657     Pieces.push_back(Unmerge.getReg(I));
2658 }
2659 
2660 LegalizerHelper::LegalizeResult
2661 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2662   Register Dst = MI.getOperand(0).getReg();
2663   Register Src = MI.getOperand(1).getReg();
2664   LLT DstTy = MRI.getType(Dst);
2665   LLT SrcTy = MRI.getType(Src);
2666 
2667   if (SrcTy.isVector()) {
2668     LLT SrcEltTy = SrcTy.getElementType();
2669     SmallVector<Register, 8> SrcRegs;
2670 
2671     if (DstTy.isVector()) {
2672       int NumDstElt = DstTy.getNumElements();
2673       int NumSrcElt = SrcTy.getNumElements();
2674 
2675       LLT DstEltTy = DstTy.getElementType();
2676       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2677       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2678 
2679       // If there's an element size mismatch, insert intermediate casts to match
2680       // the result element type.
2681       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2682         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2683         //
2684         // =>
2685         //
2686         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2687         // %3:_(<2 x s8>) = G_BITCAST %2
2688         // %4:_(<2 x s8>) = G_BITCAST %3
2689         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
2690         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2691         SrcPartTy = SrcEltTy;
2692       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2693         //
2694         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2695         //
2696         // =>
2697         //
2698         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2699         // %3:_(s16) = G_BITCAST %2
2700         // %4:_(s16) = G_BITCAST %3
2701         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
2702         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2703         DstCastTy = DstEltTy;
2704       }
2705 
2706       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2707       for (Register &SrcReg : SrcRegs)
2708         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2709     } else
2710       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2711 
2712     MIRBuilder.buildMerge(Dst, SrcRegs);
2713     MI.eraseFromParent();
2714     return Legalized;
2715   }
2716 
2717   if (DstTy.isVector()) {
2718     SmallVector<Register, 8> SrcRegs;
2719     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2720     MIRBuilder.buildMerge(Dst, SrcRegs);
2721     MI.eraseFromParent();
2722     return Legalized;
2723   }
2724 
2725   return UnableToLegalize;
2726 }
2727 
2728 /// Figure out the bit offset into a register when coercing a vector index for
2729 /// the wide element type. This is only for the case when promoting vector to
2730 /// one with larger elements.
2731 //
2732 ///
2733 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2734 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2735 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2736                                                    Register Idx,
2737                                                    unsigned NewEltSize,
2738                                                    unsigned OldEltSize) {
2739   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2740   LLT IdxTy = B.getMRI()->getType(Idx);
2741 
2742   // Now figure out the amount we need to shift to get the target bits.
2743   auto OffsetMask = B.buildConstant(
2744       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2745   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2746   return B.buildShl(IdxTy, OffsetIdx,
2747                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2748 }
2749 
2750 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2751 /// is casting to a vector with a smaller element size, perform multiple element
2752 /// extracts and merge the results. If this is coercing to a vector with larger
2753 /// elements, index the bitcasted vector and extract the target element with bit
2754 /// operations. This is intended to force the indexing in the native register
2755 /// size for architectures that can dynamically index the register file.
2756 LegalizerHelper::LegalizeResult
2757 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2758                                          LLT CastTy) {
2759   if (TypeIdx != 1)
2760     return UnableToLegalize;
2761 
2762   Register Dst = MI.getOperand(0).getReg();
2763   Register SrcVec = MI.getOperand(1).getReg();
2764   Register Idx = MI.getOperand(2).getReg();
2765   LLT SrcVecTy = MRI.getType(SrcVec);
2766   LLT IdxTy = MRI.getType(Idx);
2767 
2768   LLT SrcEltTy = SrcVecTy.getElementType();
2769   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2770   unsigned OldNumElts = SrcVecTy.getNumElements();
2771 
2772   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2773   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2774 
2775   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2776   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2777   if (NewNumElts > OldNumElts) {
2778     // Decreasing the vector element size
2779     //
2780     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2781     //  =>
2782     //  v4i32:castx = bitcast x:v2i64
2783     //
2784     // i64 = bitcast
2785     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2786     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2787     //
2788     if (NewNumElts % OldNumElts != 0)
2789       return UnableToLegalize;
2790 
2791     // Type of the intermediate result vector.
2792     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2793     LLT MidTy =
2794         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2795 
2796     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2797 
2798     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2799     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2800 
2801     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2802       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2803       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2804       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2805       NewOps[I] = Elt.getReg(0);
2806     }
2807 
2808     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2809     MIRBuilder.buildBitcast(Dst, NewVec);
2810     MI.eraseFromParent();
2811     return Legalized;
2812   }
2813 
2814   if (NewNumElts < OldNumElts) {
2815     if (NewEltSize % OldEltSize != 0)
2816       return UnableToLegalize;
2817 
2818     // This only depends on powers of 2 because we use bit tricks to figure out
2819     // the bit offset we need to shift to get the target element. A general
2820     // expansion could emit division/multiply.
2821     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2822       return UnableToLegalize;
2823 
2824     // Increasing the vector element size.
2825     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2826     //
2827     //   =>
2828     //
2829     // %cast = G_BITCAST %vec
2830     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2831     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2832     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2833     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2834     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2835     // %elt = G_TRUNC %elt_bits
2836 
2837     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2838     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2839 
2840     // Divide to get the index in the wider element type.
2841     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2842 
2843     Register WideElt = CastVec;
2844     if (CastTy.isVector()) {
2845       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2846                                                      ScaledIdx).getReg(0);
2847     }
2848 
2849     // Compute the bit offset into the register of the target element.
2850     Register OffsetBits = getBitcastWiderVectorElementOffset(
2851       MIRBuilder, Idx, NewEltSize, OldEltSize);
2852 
2853     // Shift the wide element to get the target element.
2854     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2855     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2856     MI.eraseFromParent();
2857     return Legalized;
2858   }
2859 
2860   return UnableToLegalize;
2861 }
2862 
2863 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
2864 /// TargetReg, while preserving other bits in \p TargetReg.
2865 ///
2866 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
2867 static Register buildBitFieldInsert(MachineIRBuilder &B,
2868                                     Register TargetReg, Register InsertReg,
2869                                     Register OffsetBits) {
2870   LLT TargetTy = B.getMRI()->getType(TargetReg);
2871   LLT InsertTy = B.getMRI()->getType(InsertReg);
2872   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2873   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2874 
2875   // Produce a bitmask of the value to insert
2876   auto EltMask = B.buildConstant(
2877     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2878                                    InsertTy.getSizeInBits()));
2879   // Shift it into position
2880   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2881   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2882 
2883   // Clear out the bits in the wide element
2884   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2885 
2886   // The value to insert has all zeros already, so stick it into the masked
2887   // wide element.
2888   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2889 }
2890 
2891 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2892 /// is increasing the element size, perform the indexing in the target element
2893 /// type, and use bit operations to insert at the element position. This is
2894 /// intended for architectures that can dynamically index the register file and
2895 /// want to force indexing in the native register size.
2896 LegalizerHelper::LegalizeResult
2897 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2898                                         LLT CastTy) {
2899   if (TypeIdx != 0)
2900     return UnableToLegalize;
2901 
2902   Register Dst = MI.getOperand(0).getReg();
2903   Register SrcVec = MI.getOperand(1).getReg();
2904   Register Val = MI.getOperand(2).getReg();
2905   Register Idx = MI.getOperand(3).getReg();
2906 
2907   LLT VecTy = MRI.getType(Dst);
2908   LLT IdxTy = MRI.getType(Idx);
2909 
2910   LLT VecEltTy = VecTy.getElementType();
2911   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2912   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2913   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2914 
2915   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2916   unsigned OldNumElts = VecTy.getNumElements();
2917 
2918   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2919   if (NewNumElts < OldNumElts) {
2920     if (NewEltSize % OldEltSize != 0)
2921       return UnableToLegalize;
2922 
2923     // This only depends on powers of 2 because we use bit tricks to figure out
2924     // the bit offset we need to shift to get the target element. A general
2925     // expansion could emit division/multiply.
2926     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2927       return UnableToLegalize;
2928 
2929     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2930     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2931 
2932     // Divide to get the index in the wider element type.
2933     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2934 
2935     Register ExtractedElt = CastVec;
2936     if (CastTy.isVector()) {
2937       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2938                                                           ScaledIdx).getReg(0);
2939     }
2940 
2941     // Compute the bit offset into the register of the target element.
2942     Register OffsetBits = getBitcastWiderVectorElementOffset(
2943       MIRBuilder, Idx, NewEltSize, OldEltSize);
2944 
2945     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2946                                                Val, OffsetBits);
2947     if (CastTy.isVector()) {
2948       InsertedElt = MIRBuilder.buildInsertVectorElement(
2949         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2950     }
2951 
2952     MIRBuilder.buildBitcast(Dst, InsertedElt);
2953     MI.eraseFromParent();
2954     return Legalized;
2955   }
2956 
2957   return UnableToLegalize;
2958 }
2959 
2960 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2961   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2962   Register DstReg = LoadMI.getDstReg();
2963   Register PtrReg = LoadMI.getPointerReg();
2964   LLT DstTy = MRI.getType(DstReg);
2965   MachineMemOperand &MMO = LoadMI.getMMO();
2966   LLT MemTy = MMO.getMemoryType();
2967   MachineFunction &MF = MIRBuilder.getMF();
2968 
2969   unsigned MemSizeInBits = MemTy.getSizeInBits();
2970   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2971 
2972   if (MemSizeInBits != MemStoreSizeInBits) {
2973     if (MemTy.isVector())
2974       return UnableToLegalize;
2975 
2976     // Promote to a byte-sized load if not loading an integral number of
2977     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2978     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2979     MachineMemOperand *NewMMO =
2980         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2981 
2982     Register LoadReg = DstReg;
2983     LLT LoadTy = DstTy;
2984 
2985     // If this wasn't already an extending load, we need to widen the result
2986     // register to avoid creating a load with a narrower result than the source.
2987     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
2988       LoadTy = WideMemTy;
2989       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
2990     }
2991 
2992     if (isa<GSExtLoad>(LoadMI)) {
2993       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2994       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
2995     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == DstTy) {
2996       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
2997       // The extra bits are guaranteed to be zero, since we stored them that
2998       // way.  A zext load from Wide thus automatically gives zext from MemVT.
2999       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3000     } else {
3001       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3002     }
3003 
3004     if (DstTy != LoadTy)
3005       MIRBuilder.buildTrunc(DstReg, LoadReg);
3006 
3007     LoadMI.eraseFromParent();
3008     return Legalized;
3009   }
3010 
3011   // Big endian lowering not implemented.
3012   if (MIRBuilder.getDataLayout().isBigEndian())
3013     return UnableToLegalize;
3014 
3015   // This load needs splitting into power of 2 sized loads.
3016   //
3017   // Our strategy here is to generate anyextending loads for the smaller
3018   // types up to next power-2 result type, and then combine the two larger
3019   // result values together, before truncating back down to the non-pow-2
3020   // type.
3021   // E.g. v1 = i24 load =>
3022   // v2 = i32 zextload (2 byte)
3023   // v3 = i32 load (1 byte)
3024   // v4 = i32 shl v3, 16
3025   // v5 = i32 or v4, v2
3026   // v1 = i24 trunc v5
3027   // By doing this we generate the correct truncate which should get
3028   // combined away as an artifact with a matching extend.
3029 
3030   uint64_t LargeSplitSize, SmallSplitSize;
3031 
3032   if (!isPowerOf2_32(MemSizeInBits)) {
3033     // This load needs splitting into power of 2 sized loads.
3034     LargeSplitSize = PowerOf2Floor(MemSizeInBits);
3035     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3036   } else {
3037     // This is already a power of 2, but we still need to split this in half.
3038     //
3039     // Assume we're being asked to decompose an unaligned load.
3040     // TODO: If this requires multiple splits, handle them all at once.
3041     auto &Ctx = MF.getFunction().getContext();
3042     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3043       return UnableToLegalize;
3044 
3045     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3046   }
3047 
3048   if (MemTy.isVector()) {
3049     // TODO: Handle vector extloads
3050     if (MemTy != DstTy)
3051       return UnableToLegalize;
3052 
3053     // TODO: We can do better than scalarizing the vector and at least split it
3054     // in half.
3055     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3056   }
3057 
3058   MachineMemOperand *LargeMMO =
3059       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3060   MachineMemOperand *SmallMMO =
3061       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3062 
3063   LLT PtrTy = MRI.getType(PtrReg);
3064   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3065   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3066   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3067                                              PtrReg, *LargeMMO);
3068 
3069   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3070                                             LargeSplitSize / 8);
3071   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3072   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3073   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3074                                              SmallPtr, *SmallMMO);
3075 
3076   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3077   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3078 
3079   if (AnyExtTy == DstTy)
3080     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3081   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3082     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3083     MIRBuilder.buildTrunc(DstReg, {Or});
3084   } else {
3085     assert(DstTy.isPointer() && "expected pointer");
3086     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3087 
3088     // FIXME: We currently consider this to be illegal for non-integral address
3089     // spaces, but we need still need a way to reinterpret the bits.
3090     MIRBuilder.buildIntToPtr(DstReg, Or);
3091   }
3092 
3093   LoadMI.eraseFromParent();
3094   return Legalized;
3095 }
3096 
3097 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3098   // Lower a non-power of 2 store into multiple pow-2 stores.
3099   // E.g. split an i24 store into an i16 store + i8 store.
3100   // We do this by first extending the stored value to the next largest power
3101   // of 2 type, and then using truncating stores to store the components.
3102   // By doing this, likewise with G_LOAD, generate an extend that can be
3103   // artifact-combined away instead of leaving behind extracts.
3104   Register SrcReg = StoreMI.getValueReg();
3105   Register PtrReg = StoreMI.getPointerReg();
3106   LLT SrcTy = MRI.getType(SrcReg);
3107   MachineFunction &MF = MIRBuilder.getMF();
3108   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3109   LLT MemTy = MMO.getMemoryType();
3110 
3111   unsigned StoreWidth = MemTy.getSizeInBits();
3112   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3113 
3114   if (StoreWidth != StoreSizeInBits) {
3115     if (SrcTy.isVector())
3116       return UnableToLegalize;
3117 
3118     // Promote to a byte-sized store with upper bits zero if not
3119     // storing an integral number of bytes.  For example, promote
3120     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3121     LLT WideTy = LLT::scalar(StoreSizeInBits);
3122 
3123     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3124       // Avoid creating a store with a narrower source than result.
3125       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3126       SrcTy = WideTy;
3127     }
3128 
3129     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3130 
3131     MachineMemOperand *NewMMO =
3132         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3133     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3134     StoreMI.eraseFromParent();
3135     return Legalized;
3136   }
3137 
3138   if (MemTy.isVector()) {
3139     // TODO: Handle vector trunc stores
3140     if (MemTy != SrcTy)
3141       return UnableToLegalize;
3142 
3143     // TODO: We can do better than scalarizing the vector and at least split it
3144     // in half.
3145     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3146   }
3147 
3148   unsigned MemSizeInBits = MemTy.getSizeInBits();
3149   uint64_t LargeSplitSize, SmallSplitSize;
3150 
3151   if (!isPowerOf2_32(MemSizeInBits)) {
3152     LargeSplitSize = PowerOf2Floor(MemTy.getSizeInBits());
3153     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3154   } else {
3155     auto &Ctx = MF.getFunction().getContext();
3156     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3157       return UnableToLegalize; // Don't know what we're being asked to do.
3158 
3159     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3160   }
3161 
3162   // Extend to the next pow-2. If this store was itself the result of lowering,
3163   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3164   // that's wider than the stored size.
3165   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3166   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3167 
3168   if (SrcTy.isPointer()) {
3169     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3170     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3171   }
3172 
3173   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3174 
3175   // Obtain the smaller value by shifting away the larger value.
3176   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3177   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3178 
3179   // Generate the PtrAdd and truncating stores.
3180   LLT PtrTy = MRI.getType(PtrReg);
3181   auto OffsetCst = MIRBuilder.buildConstant(
3182     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3183   auto SmallPtr =
3184     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3185 
3186   MachineMemOperand *LargeMMO =
3187     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3188   MachineMemOperand *SmallMMO =
3189     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3190   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3191   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3192   StoreMI.eraseFromParent();
3193   return Legalized;
3194 }
3195 
3196 LegalizerHelper::LegalizeResult
3197 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3198   switch (MI.getOpcode()) {
3199   case TargetOpcode::G_LOAD: {
3200     if (TypeIdx != 0)
3201       return UnableToLegalize;
3202     MachineMemOperand &MMO = **MI.memoperands_begin();
3203 
3204     // Not sure how to interpret a bitcast of an extending load.
3205     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3206       return UnableToLegalize;
3207 
3208     Observer.changingInstr(MI);
3209     bitcastDst(MI, CastTy, 0);
3210     MMO.setType(CastTy);
3211     Observer.changedInstr(MI);
3212     return Legalized;
3213   }
3214   case TargetOpcode::G_STORE: {
3215     if (TypeIdx != 0)
3216       return UnableToLegalize;
3217 
3218     MachineMemOperand &MMO = **MI.memoperands_begin();
3219 
3220     // Not sure how to interpret a bitcast of a truncating store.
3221     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3222       return UnableToLegalize;
3223 
3224     Observer.changingInstr(MI);
3225     bitcastSrc(MI, CastTy, 0);
3226     MMO.setType(CastTy);
3227     Observer.changedInstr(MI);
3228     return Legalized;
3229   }
3230   case TargetOpcode::G_SELECT: {
3231     if (TypeIdx != 0)
3232       return UnableToLegalize;
3233 
3234     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3235       LLVM_DEBUG(
3236           dbgs() << "bitcast action not implemented for vector select\n");
3237       return UnableToLegalize;
3238     }
3239 
3240     Observer.changingInstr(MI);
3241     bitcastSrc(MI, CastTy, 2);
3242     bitcastSrc(MI, CastTy, 3);
3243     bitcastDst(MI, CastTy, 0);
3244     Observer.changedInstr(MI);
3245     return Legalized;
3246   }
3247   case TargetOpcode::G_AND:
3248   case TargetOpcode::G_OR:
3249   case TargetOpcode::G_XOR: {
3250     Observer.changingInstr(MI);
3251     bitcastSrc(MI, CastTy, 1);
3252     bitcastSrc(MI, CastTy, 2);
3253     bitcastDst(MI, CastTy, 0);
3254     Observer.changedInstr(MI);
3255     return Legalized;
3256   }
3257   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3258     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3259   case TargetOpcode::G_INSERT_VECTOR_ELT:
3260     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3261   default:
3262     return UnableToLegalize;
3263   }
3264 }
3265 
3266 // Legalize an instruction by changing the opcode in place.
3267 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3268     Observer.changingInstr(MI);
3269     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3270     Observer.changedInstr(MI);
3271 }
3272 
3273 LegalizerHelper::LegalizeResult
3274 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3275   using namespace TargetOpcode;
3276 
3277   switch(MI.getOpcode()) {
3278   default:
3279     return UnableToLegalize;
3280   case TargetOpcode::G_BITCAST:
3281     return lowerBitcast(MI);
3282   case TargetOpcode::G_SREM:
3283   case TargetOpcode::G_UREM: {
3284     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3285     auto Quot =
3286         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3287                               {MI.getOperand(1), MI.getOperand(2)});
3288 
3289     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3290     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3291     MI.eraseFromParent();
3292     return Legalized;
3293   }
3294   case TargetOpcode::G_SADDO:
3295   case TargetOpcode::G_SSUBO:
3296     return lowerSADDO_SSUBO(MI);
3297   case TargetOpcode::G_UMULH:
3298   case TargetOpcode::G_SMULH:
3299     return lowerSMULH_UMULH(MI);
3300   case TargetOpcode::G_SMULO:
3301   case TargetOpcode::G_UMULO: {
3302     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3303     // result.
3304     Register Res = MI.getOperand(0).getReg();
3305     Register Overflow = MI.getOperand(1).getReg();
3306     Register LHS = MI.getOperand(2).getReg();
3307     Register RHS = MI.getOperand(3).getReg();
3308     LLT Ty = MRI.getType(Res);
3309 
3310     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3311                           ? TargetOpcode::G_SMULH
3312                           : TargetOpcode::G_UMULH;
3313 
3314     Observer.changingInstr(MI);
3315     const auto &TII = MIRBuilder.getTII();
3316     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3317     MI.RemoveOperand(1);
3318     Observer.changedInstr(MI);
3319 
3320     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3321     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3322 
3323     // Move insert point forward so we can use the Res register if needed.
3324     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3325 
3326     // For *signed* multiply, overflow is detected by checking:
3327     // (hi != (lo >> bitwidth-1))
3328     if (Opcode == TargetOpcode::G_SMULH) {
3329       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3330       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3331       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3332     } else {
3333       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3334     }
3335     return Legalized;
3336   }
3337   case TargetOpcode::G_FNEG: {
3338     Register Res = MI.getOperand(0).getReg();
3339     LLT Ty = MRI.getType(Res);
3340 
3341     // TODO: Handle vector types once we are able to
3342     // represent them.
3343     if (Ty.isVector())
3344       return UnableToLegalize;
3345     auto SignMask =
3346         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3347     Register SubByReg = MI.getOperand(1).getReg();
3348     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3349     MI.eraseFromParent();
3350     return Legalized;
3351   }
3352   case TargetOpcode::G_FSUB: {
3353     Register Res = MI.getOperand(0).getReg();
3354     LLT Ty = MRI.getType(Res);
3355 
3356     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3357     // First, check if G_FNEG is marked as Lower. If so, we may
3358     // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
3359     if (LI.getAction({G_FNEG, {Ty}}).Action == Lower)
3360       return UnableToLegalize;
3361     Register LHS = MI.getOperand(1).getReg();
3362     Register RHS = MI.getOperand(2).getReg();
3363     Register Neg = MRI.createGenericVirtualRegister(Ty);
3364     MIRBuilder.buildFNeg(Neg, RHS);
3365     MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3366     MI.eraseFromParent();
3367     return Legalized;
3368   }
3369   case TargetOpcode::G_FMAD:
3370     return lowerFMad(MI);
3371   case TargetOpcode::G_FFLOOR:
3372     return lowerFFloor(MI);
3373   case TargetOpcode::G_INTRINSIC_ROUND:
3374     return lowerIntrinsicRound(MI);
3375   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3376     // Since round even is the assumed rounding mode for unconstrained FP
3377     // operations, rint and roundeven are the same operation.
3378     changeOpcode(MI, TargetOpcode::G_FRINT);
3379     return Legalized;
3380   }
3381   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3382     Register OldValRes = MI.getOperand(0).getReg();
3383     Register SuccessRes = MI.getOperand(1).getReg();
3384     Register Addr = MI.getOperand(2).getReg();
3385     Register CmpVal = MI.getOperand(3).getReg();
3386     Register NewVal = MI.getOperand(4).getReg();
3387     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3388                                   **MI.memoperands_begin());
3389     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3390     MI.eraseFromParent();
3391     return Legalized;
3392   }
3393   case TargetOpcode::G_LOAD:
3394   case TargetOpcode::G_SEXTLOAD:
3395   case TargetOpcode::G_ZEXTLOAD:
3396     return lowerLoad(cast<GAnyLoad>(MI));
3397   case TargetOpcode::G_STORE:
3398     return lowerStore(cast<GStore>(MI));
3399   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3400   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3401   case TargetOpcode::G_CTLZ:
3402   case TargetOpcode::G_CTTZ:
3403   case TargetOpcode::G_CTPOP:
3404     return lowerBitCount(MI);
3405   case G_UADDO: {
3406     Register Res = MI.getOperand(0).getReg();
3407     Register CarryOut = MI.getOperand(1).getReg();
3408     Register LHS = MI.getOperand(2).getReg();
3409     Register RHS = MI.getOperand(3).getReg();
3410 
3411     MIRBuilder.buildAdd(Res, LHS, RHS);
3412     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3413 
3414     MI.eraseFromParent();
3415     return Legalized;
3416   }
3417   case G_UADDE: {
3418     Register Res = MI.getOperand(0).getReg();
3419     Register CarryOut = MI.getOperand(1).getReg();
3420     Register LHS = MI.getOperand(2).getReg();
3421     Register RHS = MI.getOperand(3).getReg();
3422     Register CarryIn = MI.getOperand(4).getReg();
3423     LLT Ty = MRI.getType(Res);
3424 
3425     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3426     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3427     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3428     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3429 
3430     MI.eraseFromParent();
3431     return Legalized;
3432   }
3433   case G_USUBO: {
3434     Register Res = MI.getOperand(0).getReg();
3435     Register BorrowOut = MI.getOperand(1).getReg();
3436     Register LHS = MI.getOperand(2).getReg();
3437     Register RHS = MI.getOperand(3).getReg();
3438 
3439     MIRBuilder.buildSub(Res, LHS, RHS);
3440     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3441 
3442     MI.eraseFromParent();
3443     return Legalized;
3444   }
3445   case G_USUBE: {
3446     Register Res = MI.getOperand(0).getReg();
3447     Register BorrowOut = MI.getOperand(1).getReg();
3448     Register LHS = MI.getOperand(2).getReg();
3449     Register RHS = MI.getOperand(3).getReg();
3450     Register BorrowIn = MI.getOperand(4).getReg();
3451     const LLT CondTy = MRI.getType(BorrowOut);
3452     const LLT Ty = MRI.getType(Res);
3453 
3454     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3455     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3456     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3457 
3458     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3459     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3460     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3461 
3462     MI.eraseFromParent();
3463     return Legalized;
3464   }
3465   case G_UITOFP:
3466     return lowerUITOFP(MI);
3467   case G_SITOFP:
3468     return lowerSITOFP(MI);
3469   case G_FPTOUI:
3470     return lowerFPTOUI(MI);
3471   case G_FPTOSI:
3472     return lowerFPTOSI(MI);
3473   case G_FPTRUNC:
3474     return lowerFPTRUNC(MI);
3475   case G_FPOWI:
3476     return lowerFPOWI(MI);
3477   case G_SMIN:
3478   case G_SMAX:
3479   case G_UMIN:
3480   case G_UMAX:
3481     return lowerMinMax(MI);
3482   case G_FCOPYSIGN:
3483     return lowerFCopySign(MI);
3484   case G_FMINNUM:
3485   case G_FMAXNUM:
3486     return lowerFMinNumMaxNum(MI);
3487   case G_MERGE_VALUES:
3488     return lowerMergeValues(MI);
3489   case G_UNMERGE_VALUES:
3490     return lowerUnmergeValues(MI);
3491   case TargetOpcode::G_SEXT_INREG: {
3492     assert(MI.getOperand(2).isImm() && "Expected immediate");
3493     int64_t SizeInBits = MI.getOperand(2).getImm();
3494 
3495     Register DstReg = MI.getOperand(0).getReg();
3496     Register SrcReg = MI.getOperand(1).getReg();
3497     LLT DstTy = MRI.getType(DstReg);
3498     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3499 
3500     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3501     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3502     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3503     MI.eraseFromParent();
3504     return Legalized;
3505   }
3506   case G_EXTRACT_VECTOR_ELT:
3507   case G_INSERT_VECTOR_ELT:
3508     return lowerExtractInsertVectorElt(MI);
3509   case G_SHUFFLE_VECTOR:
3510     return lowerShuffleVector(MI);
3511   case G_DYN_STACKALLOC:
3512     return lowerDynStackAlloc(MI);
3513   case G_EXTRACT:
3514     return lowerExtract(MI);
3515   case G_INSERT:
3516     return lowerInsert(MI);
3517   case G_BSWAP:
3518     return lowerBswap(MI);
3519   case G_BITREVERSE:
3520     return lowerBitreverse(MI);
3521   case G_READ_REGISTER:
3522   case G_WRITE_REGISTER:
3523     return lowerReadWriteRegister(MI);
3524   case G_UADDSAT:
3525   case G_USUBSAT: {
3526     // Try to make a reasonable guess about which lowering strategy to use. The
3527     // target can override this with custom lowering and calling the
3528     // implementation functions.
3529     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3530     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3531       return lowerAddSubSatToMinMax(MI);
3532     return lowerAddSubSatToAddoSubo(MI);
3533   }
3534   case G_SADDSAT:
3535   case G_SSUBSAT: {
3536     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3537 
3538     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3539     // since it's a shorter expansion. However, we would need to figure out the
3540     // preferred boolean type for the carry out for the query.
3541     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3542       return lowerAddSubSatToMinMax(MI);
3543     return lowerAddSubSatToAddoSubo(MI);
3544   }
3545   case G_SSHLSAT:
3546   case G_USHLSAT:
3547     return lowerShlSat(MI);
3548   case G_ABS:
3549     return lowerAbsToAddXor(MI);
3550   case G_SELECT:
3551     return lowerSelect(MI);
3552   case G_SDIVREM:
3553   case G_UDIVREM:
3554     return lowerDIVREM(MI);
3555   case G_FSHL:
3556   case G_FSHR:
3557     return lowerFunnelShift(MI);
3558   case G_ROTL:
3559   case G_ROTR:
3560     return lowerRotate(MI);
3561   case G_MEMSET:
3562   case G_MEMCPY:
3563   case G_MEMMOVE:
3564     return lowerMemCpyFamily(MI);
3565   case G_MEMCPY_INLINE:
3566     return lowerMemcpyInline(MI);
3567   GISEL_VECREDUCE_CASES_NONSEQ
3568     return lowerVectorReduction(MI);
3569   }
3570 }
3571 
3572 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3573                                                   Align MinAlign) const {
3574   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3575   // datalayout for the preferred alignment. Also there should be a target hook
3576   // for this to allow targets to reduce the alignment and ignore the
3577   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3578   // the type.
3579   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3580 }
3581 
3582 MachineInstrBuilder
3583 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3584                                       MachinePointerInfo &PtrInfo) {
3585   MachineFunction &MF = MIRBuilder.getMF();
3586   const DataLayout &DL = MIRBuilder.getDataLayout();
3587   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3588 
3589   unsigned AddrSpace = DL.getAllocaAddrSpace();
3590   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3591 
3592   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3593   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3594 }
3595 
3596 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3597                                         LLT VecTy) {
3598   int64_t IdxVal;
3599   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3600     return IdxReg;
3601 
3602   LLT IdxTy = B.getMRI()->getType(IdxReg);
3603   unsigned NElts = VecTy.getNumElements();
3604   if (isPowerOf2_32(NElts)) {
3605     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3606     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3607   }
3608 
3609   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3610       .getReg(0);
3611 }
3612 
3613 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3614                                                   Register Index) {
3615   LLT EltTy = VecTy.getElementType();
3616 
3617   // Calculate the element offset and add it to the pointer.
3618   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3619   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3620          "Converting bits to bytes lost precision");
3621 
3622   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3623 
3624   LLT IdxTy = MRI.getType(Index);
3625   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3626                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3627 
3628   LLT PtrTy = MRI.getType(VecPtr);
3629   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3630 }
3631 
3632 #ifndef NDEBUG
3633 /// Check that all vector operands have same number of elements. Other operands
3634 /// should be listed in NonVecOp.
3635 static bool hasSameNumEltsOnAllVectorOperands(
3636     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3637     std::initializer_list<unsigned> NonVecOpIndices) {
3638   if (MI.getNumMemOperands() != 0)
3639     return false;
3640 
3641   LLT VecTy = MRI.getType(MI.getReg(0));
3642   if (!VecTy.isVector())
3643     return false;
3644   unsigned NumElts = VecTy.getNumElements();
3645 
3646   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3647     MachineOperand &Op = MI.getOperand(OpIdx);
3648     if (!Op.isReg()) {
3649       if (!is_contained(NonVecOpIndices, OpIdx))
3650         return false;
3651       continue;
3652     }
3653 
3654     LLT Ty = MRI.getType(Op.getReg());
3655     if (!Ty.isVector()) {
3656       if (!is_contained(NonVecOpIndices, OpIdx))
3657         return false;
3658       continue;
3659     }
3660 
3661     if (Ty.getNumElements() != NumElts)
3662       return false;
3663   }
3664 
3665   return true;
3666 }
3667 #endif
3668 
3669 /// Fill \p DstOps with DstOps that have same number of elements combined as
3670 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
3671 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
3672 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
3673 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
3674                        unsigned NumElts) {
3675   LLT LeftoverTy;
3676   assert(Ty.isVector() && "Expected vector type");
3677   LLT EltTy = Ty.getElementType();
3678   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
3679   int NumParts, NumLeftover;
3680   std::tie(NumParts, NumLeftover) =
3681       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
3682 
3683   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
3684   for (int i = 0; i < NumParts; ++i) {
3685     DstOps.push_back(NarrowTy);
3686   }
3687 
3688   if (LeftoverTy.isValid()) {
3689     assert(NumLeftover == 1 && "expected exactly one leftover");
3690     DstOps.push_back(LeftoverTy);
3691   }
3692 }
3693 
3694 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
3695 /// made from \p Op depending on operand type.
3696 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
3697                            MachineOperand &Op) {
3698   for (unsigned i = 0; i < N; ++i) {
3699     if (Op.isReg())
3700       Ops.push_back(Op.getReg());
3701     else if (Op.isImm())
3702       Ops.push_back(Op.getImm());
3703     else if (Op.isPredicate())
3704       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
3705     else
3706       llvm_unreachable("Unsupported type");
3707   }
3708 }
3709 
3710 // Handle splitting vector operations which need to have the same number of
3711 // elements in each type index, but each type index may have a different element
3712 // type.
3713 //
3714 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3715 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3716 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3717 //
3718 // Also handles some irregular breakdown cases, e.g.
3719 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3720 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3721 //             s64 = G_SHL s64, s32
3722 LegalizerHelper::LegalizeResult
3723 LegalizerHelper::fewerElementsVectorMultiEltType(
3724     GenericMachineInstr &MI, unsigned NumElts,
3725     std::initializer_list<unsigned> NonVecOpIndices) {
3726   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
3727          "Non-compatible opcode or not specified non-vector operands");
3728   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3729 
3730   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3731   unsigned NumDefs = MI.getNumDefs();
3732 
3733   // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
3734   // Build instructions with DstOps to use instruction found by CSE directly.
3735   // CSE copies found instruction into given vreg when building with vreg dest.
3736   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
3737   // Output registers will be taken from created instructions.
3738   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
3739   for (unsigned i = 0; i < NumDefs; ++i) {
3740     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
3741   }
3742 
3743   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
3744   // Operands listed in NonVecOpIndices will be used as is without splitting;
3745   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
3746   // scalar condition (op 1), immediate in sext_inreg (op 2).
3747   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
3748   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3749        ++UseIdx, ++UseNo) {
3750     if (is_contained(NonVecOpIndices, UseIdx)) {
3751       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
3752                      MI.getOperand(UseIdx));
3753     } else {
3754       SmallVector<Register, 8> SplitPieces;
3755       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
3756       for (auto Reg : SplitPieces)
3757         InputOpsPieces[UseNo].push_back(Reg);
3758     }
3759   }
3760 
3761   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3762 
3763   // Take i-th piece of each input operand split and build sub-vector/scalar
3764   // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
3765   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3766     SmallVector<DstOp, 2> Defs;
3767     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3768       Defs.push_back(OutputOpsPieces[DstNo][i]);
3769 
3770     SmallVector<SrcOp, 3> Uses;
3771     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
3772       Uses.push_back(InputOpsPieces[InputNo][i]);
3773 
3774     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
3775     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3776       OutputRegs[DstNo].push_back(I.getReg(DstNo));
3777   }
3778 
3779   // Merge small outputs into MI's output for each def operand.
3780   if (NumLeftovers) {
3781     for (unsigned i = 0; i < NumDefs; ++i)
3782       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
3783   } else {
3784     for (unsigned i = 0; i < NumDefs; ++i)
3785       MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]);
3786   }
3787 
3788   MI.eraseFromParent();
3789   return Legalized;
3790 }
3791 
3792 LegalizerHelper::LegalizeResult
3793 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
3794                                         unsigned NumElts) {
3795   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3796 
3797   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3798   unsigned NumDefs = MI.getNumDefs();
3799 
3800   SmallVector<DstOp, 8> OutputOpsPieces;
3801   SmallVector<Register, 8> OutputRegs;
3802   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
3803 
3804   // Instructions that perform register split will be inserted in basic block
3805   // where register is defined (basic block is in the next operand).
3806   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
3807   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3808        UseIdx += 2, ++UseNo) {
3809     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
3810     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
3811     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
3812   }
3813 
3814   // Build PHIs with fewer elements.
3815   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3816   MIRBuilder.setInsertPt(*MI.getParent(), MI);
3817   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3818     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
3819     Phi.addDef(
3820         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
3821     OutputRegs.push_back(Phi.getReg(0));
3822 
3823     for (unsigned j = 0; j < NumInputs / 2; ++j) {
3824       Phi.addUse(InputOpsPieces[j][i]);
3825       Phi.add(MI.getOperand(1 + j * 2 + 1));
3826     }
3827   }
3828 
3829   // Merge small outputs into MI's def.
3830   if (NumLeftovers) {
3831     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
3832   } else {
3833     MIRBuilder.buildMerge(MI.getReg(0), OutputRegs);
3834   }
3835 
3836   MI.eraseFromParent();
3837   return Legalized;
3838 }
3839 
3840 LegalizerHelper::LegalizeResult
3841 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3842                                                   unsigned TypeIdx,
3843                                                   LLT NarrowTy) {
3844   const int NumDst = MI.getNumOperands() - 1;
3845   const Register SrcReg = MI.getOperand(NumDst).getReg();
3846   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3847   LLT SrcTy = MRI.getType(SrcReg);
3848 
3849   if (TypeIdx != 1 || NarrowTy == DstTy)
3850     return UnableToLegalize;
3851 
3852   // Requires compatible types. Otherwise SrcReg should have been defined by
3853   // merge-like instruction that would get artifact combined. Most likely
3854   // instruction that defines SrcReg has to perform more/fewer elements
3855   // legalization compatible with NarrowTy.
3856   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3857   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3858 
3859   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3860       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
3861     return UnableToLegalize;
3862 
3863   // This is most likely DstTy (smaller then register size) packed in SrcTy
3864   // (larger then register size) and since unmerge was not combined it will be
3865   // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
3866   // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.
3867 
3868   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
3869   //
3870   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
3871   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
3872   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
3873   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
3874   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3875   const int PartsPerUnmerge = NumDst / NumUnmerge;
3876 
3877   for (int I = 0; I != NumUnmerge; ++I) {
3878     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3879 
3880     for (int J = 0; J != PartsPerUnmerge; ++J)
3881       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3882     MIB.addUse(Unmerge.getReg(I));
3883   }
3884 
3885   MI.eraseFromParent();
3886   return Legalized;
3887 }
3888 
3889 LegalizerHelper::LegalizeResult
3890 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3891                                           LLT NarrowTy) {
3892   Register DstReg = MI.getOperand(0).getReg();
3893   LLT DstTy = MRI.getType(DstReg);
3894   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
3895   // Requires compatible types. Otherwise user of DstReg did not perform unmerge
3896   // that should have been artifact combined. Most likely instruction that uses
3897   // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
3898   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3899   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3900   if (NarrowTy == SrcTy)
3901     return UnableToLegalize;
3902 
3903   // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
3904   // is for old mir tests. Since the changes to more/fewer elements it should no
3905   // longer be possible to generate MIR like this when starting from llvm-ir
3906   // because LCMTy approach was replaced with merge/unmerge to vector elements.
3907   if (TypeIdx == 1) {
3908     assert(SrcTy.isVector() && "Expected vector types");
3909     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3910     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3911         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
3912       return UnableToLegalize;
3913     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
3914     //
3915     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
3916     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
3917     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
3918     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
3919     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
3920     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
3921 
3922     SmallVector<Register, 8> Elts;
3923     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
3924     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
3925       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
3926       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
3927         Elts.push_back(Unmerge.getReg(j));
3928     }
3929 
3930     SmallVector<Register, 8> NarrowTyElts;
3931     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
3932     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
3933     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
3934          ++i, Offset += NumNarrowTyElts) {
3935       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
3936       NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0));
3937     }
3938 
3939     MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3940     MI.eraseFromParent();
3941     return Legalized;
3942   }
3943 
3944   assert(TypeIdx == 0 && "Bad type index");
3945   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
3946       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
3947     return UnableToLegalize;
3948 
3949   // This is most likely SrcTy (smaller then register size) packed in DstTy
3950   // (larger then register size) and since merge was not combined it will be
3951   // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
3952   // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.
3953 
3954   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
3955   //
3956   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
3957   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
3958   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
3959   SmallVector<Register, 8> NarrowTyElts;
3960   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3961   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
3962   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
3963   for (unsigned i = 0; i < NumParts; ++i) {
3964     SmallVector<Register, 8> Sources;
3965     for (unsigned j = 0; j < NumElts; ++j)
3966       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
3967     NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0));
3968   }
3969 
3970   MIRBuilder.buildMerge(DstReg, NarrowTyElts);
3971   MI.eraseFromParent();
3972   return Legalized;
3973 }
3974 
3975 LegalizerHelper::LegalizeResult
3976 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
3977                                                            unsigned TypeIdx,
3978                                                            LLT NarrowVecTy) {
3979   Register DstReg = MI.getOperand(0).getReg();
3980   Register SrcVec = MI.getOperand(1).getReg();
3981   Register InsertVal;
3982   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
3983 
3984   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
3985   if (IsInsert)
3986     InsertVal = MI.getOperand(2).getReg();
3987 
3988   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
3989 
3990   // TODO: Handle total scalarization case.
3991   if (!NarrowVecTy.isVector())
3992     return UnableToLegalize;
3993 
3994   LLT VecTy = MRI.getType(SrcVec);
3995 
3996   // If the index is a constant, we can really break this down as you would
3997   // expect, and index into the target size pieces.
3998   int64_t IdxVal;
3999   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4000   if (MaybeCst) {
4001     IdxVal = MaybeCst->Value.getSExtValue();
4002     // Avoid out of bounds indexing the pieces.
4003     if (IdxVal >= VecTy.getNumElements()) {
4004       MIRBuilder.buildUndef(DstReg);
4005       MI.eraseFromParent();
4006       return Legalized;
4007     }
4008 
4009     SmallVector<Register, 8> VecParts;
4010     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4011 
4012     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4013     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4014                                     TargetOpcode::G_ANYEXT);
4015 
4016     unsigned NewNumElts = NarrowVecTy.getNumElements();
4017 
4018     LLT IdxTy = MRI.getType(Idx);
4019     int64_t PartIdx = IdxVal / NewNumElts;
4020     auto NewIdx =
4021         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4022 
4023     if (IsInsert) {
4024       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4025 
4026       // Use the adjusted index to insert into one of the subvectors.
4027       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4028           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4029       VecParts[PartIdx] = InsertPart.getReg(0);
4030 
4031       // Recombine the inserted subvector with the others to reform the result
4032       // vector.
4033       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4034     } else {
4035       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4036     }
4037 
4038     MI.eraseFromParent();
4039     return Legalized;
4040   }
4041 
4042   // With a variable index, we can't perform the operation in a smaller type, so
4043   // we're forced to expand this.
4044   //
4045   // TODO: We could emit a chain of compare/select to figure out which piece to
4046   // index.
4047   return lowerExtractInsertVectorElt(MI);
4048 }
4049 
4050 LegalizerHelper::LegalizeResult
4051 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4052                                       LLT NarrowTy) {
4053   // FIXME: Don't know how to handle secondary types yet.
4054   if (TypeIdx != 0)
4055     return UnableToLegalize;
4056 
4057   // This implementation doesn't work for atomics. Give up instead of doing
4058   // something invalid.
4059   if (LdStMI.isAtomic())
4060     return UnableToLegalize;
4061 
4062   bool IsLoad = isa<GLoad>(LdStMI);
4063   Register ValReg = LdStMI.getReg(0);
4064   Register AddrReg = LdStMI.getPointerReg();
4065   LLT ValTy = MRI.getType(ValReg);
4066 
4067   // FIXME: Do we need a distinct NarrowMemory legalize action?
4068   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4069     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4070     return UnableToLegalize;
4071   }
4072 
4073   int NumParts = -1;
4074   int NumLeftover = -1;
4075   LLT LeftoverTy;
4076   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4077   if (IsLoad) {
4078     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4079   } else {
4080     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4081                      NarrowLeftoverRegs)) {
4082       NumParts = NarrowRegs.size();
4083       NumLeftover = NarrowLeftoverRegs.size();
4084     }
4085   }
4086 
4087   if (NumParts == -1)
4088     return UnableToLegalize;
4089 
4090   LLT PtrTy = MRI.getType(AddrReg);
4091   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4092 
4093   unsigned TotalSize = ValTy.getSizeInBits();
4094 
4095   // Split the load/store into PartTy sized pieces starting at Offset. If this
4096   // is a load, return the new registers in ValRegs. For a store, each elements
4097   // of ValRegs should be PartTy. Returns the next offset that needs to be
4098   // handled.
4099   auto MMO = LdStMI.getMMO();
4100   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4101                              unsigned Offset) -> unsigned {
4102     MachineFunction &MF = MIRBuilder.getMF();
4103     unsigned PartSize = PartTy.getSizeInBits();
4104     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4105          Offset += PartSize, ++Idx) {
4106       unsigned ByteOffset = Offset / 8;
4107       Register NewAddrReg;
4108 
4109       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4110 
4111       MachineMemOperand *NewMMO =
4112           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4113 
4114       if (IsLoad) {
4115         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4116         ValRegs.push_back(Dst);
4117         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4118       } else {
4119         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4120       }
4121     }
4122 
4123     return Offset;
4124   };
4125 
4126   unsigned HandledOffset = splitTypePieces(NarrowTy, NarrowRegs, 0);
4127 
4128   // Handle the rest of the register if this isn't an even type breakdown.
4129   if (LeftoverTy.isValid())
4130     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, HandledOffset);
4131 
4132   if (IsLoad) {
4133     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4134                 LeftoverTy, NarrowLeftoverRegs);
4135   }
4136 
4137   LdStMI.eraseFromParent();
4138   return Legalized;
4139 }
4140 
4141 LegalizerHelper::LegalizeResult
4142 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4143                                      LLT NarrowTy) {
4144   using namespace TargetOpcode;
4145   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4146   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4147 
4148   switch (MI.getOpcode()) {
4149   case G_IMPLICIT_DEF:
4150   case G_TRUNC:
4151   case G_AND:
4152   case G_OR:
4153   case G_XOR:
4154   case G_ADD:
4155   case G_SUB:
4156   case G_MUL:
4157   case G_PTR_ADD:
4158   case G_SMULH:
4159   case G_UMULH:
4160   case G_FADD:
4161   case G_FMUL:
4162   case G_FSUB:
4163   case G_FNEG:
4164   case G_FABS:
4165   case G_FCANONICALIZE:
4166   case G_FDIV:
4167   case G_FREM:
4168   case G_FMA:
4169   case G_FMAD:
4170   case G_FPOW:
4171   case G_FEXP:
4172   case G_FEXP2:
4173   case G_FLOG:
4174   case G_FLOG2:
4175   case G_FLOG10:
4176   case G_FNEARBYINT:
4177   case G_FCEIL:
4178   case G_FFLOOR:
4179   case G_FRINT:
4180   case G_INTRINSIC_ROUND:
4181   case G_INTRINSIC_ROUNDEVEN:
4182   case G_INTRINSIC_TRUNC:
4183   case G_FCOS:
4184   case G_FSIN:
4185   case G_FSQRT:
4186   case G_BSWAP:
4187   case G_BITREVERSE:
4188   case G_SDIV:
4189   case G_UDIV:
4190   case G_SREM:
4191   case G_UREM:
4192   case G_SDIVREM:
4193   case G_UDIVREM:
4194   case G_SMIN:
4195   case G_SMAX:
4196   case G_UMIN:
4197   case G_UMAX:
4198   case G_ABS:
4199   case G_FMINNUM:
4200   case G_FMAXNUM:
4201   case G_FMINNUM_IEEE:
4202   case G_FMAXNUM_IEEE:
4203   case G_FMINIMUM:
4204   case G_FMAXIMUM:
4205   case G_FSHL:
4206   case G_FSHR:
4207   case G_ROTL:
4208   case G_ROTR:
4209   case G_FREEZE:
4210   case G_SADDSAT:
4211   case G_SSUBSAT:
4212   case G_UADDSAT:
4213   case G_USUBSAT:
4214   case G_UMULO:
4215   case G_SMULO:
4216   case G_SHL:
4217   case G_LSHR:
4218   case G_ASHR:
4219   case G_SSHLSAT:
4220   case G_USHLSAT:
4221   case G_CTLZ:
4222   case G_CTLZ_ZERO_UNDEF:
4223   case G_CTTZ:
4224   case G_CTTZ_ZERO_UNDEF:
4225   case G_CTPOP:
4226   case G_FCOPYSIGN:
4227   case G_ZEXT:
4228   case G_SEXT:
4229   case G_ANYEXT:
4230   case G_FPEXT:
4231   case G_FPTRUNC:
4232   case G_SITOFP:
4233   case G_UITOFP:
4234   case G_FPTOSI:
4235   case G_FPTOUI:
4236   case G_INTTOPTR:
4237   case G_PTRTOINT:
4238   case G_ADDRSPACE_CAST:
4239     return fewerElementsVectorMultiEltType(GMI, NumElts);
4240   case G_ICMP:
4241   case G_FCMP:
4242     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/});
4243   case G_SELECT:
4244     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4245       return fewerElementsVectorMultiEltType(GMI, NumElts);
4246     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4247   case G_PHI:
4248     return fewerElementsVectorPhi(GMI, NumElts);
4249   case G_UNMERGE_VALUES:
4250     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4251   case G_BUILD_VECTOR:
4252     assert(TypeIdx == 0 && "not a vector type index");
4253     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4254   case G_CONCAT_VECTORS:
4255     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4256       return UnableToLegalize;
4257     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4258   case G_EXTRACT_VECTOR_ELT:
4259   case G_INSERT_VECTOR_ELT:
4260     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4261   case G_LOAD:
4262   case G_STORE:
4263     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4264   case G_SEXT_INREG:
4265     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4266   GISEL_VECREDUCE_CASES_NONSEQ
4267     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4268   case G_SHUFFLE_VECTOR:
4269     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4270   default:
4271     return UnableToLegalize;
4272   }
4273 }
4274 
4275 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4276     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4277   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4278   if (TypeIdx != 0)
4279     return UnableToLegalize;
4280 
4281   Register DstReg = MI.getOperand(0).getReg();
4282   Register Src1Reg = MI.getOperand(1).getReg();
4283   Register Src2Reg = MI.getOperand(2).getReg();
4284   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4285   LLT DstTy = MRI.getType(DstReg);
4286   LLT Src1Ty = MRI.getType(Src1Reg);
4287   LLT Src2Ty = MRI.getType(Src2Reg);
4288   // The shuffle should be canonicalized by now.
4289   if (DstTy != Src1Ty)
4290     return UnableToLegalize;
4291   if (DstTy != Src2Ty)
4292     return UnableToLegalize;
4293 
4294   if (!isPowerOf2_32(DstTy.getNumElements()))
4295     return UnableToLegalize;
4296 
4297   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4298   // Further legalization attempts will be needed to do split further.
4299   NarrowTy =
4300       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4301   unsigned NewElts = NarrowTy.getNumElements();
4302 
4303   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4304   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4305   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4306   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4307                         SplitSrc2Regs[1]};
4308 
4309   Register Hi, Lo;
4310 
4311   // If Lo or Hi uses elements from at most two of the four input vectors, then
4312   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4313   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4314   SmallVector<int, 16> Ops;
4315   for (unsigned High = 0; High < 2; ++High) {
4316     Register &Output = High ? Hi : Lo;
4317 
4318     // Build a shuffle mask for the output, discovering on the fly which
4319     // input vectors to use as shuffle operands (recorded in InputUsed).
4320     // If building a suitable shuffle vector proves too hard, then bail
4321     // out with useBuildVector set.
4322     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4323     unsigned FirstMaskIdx = High * NewElts;
4324     bool UseBuildVector = false;
4325     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4326       // The mask element.  This indexes into the input.
4327       int Idx = Mask[FirstMaskIdx + MaskOffset];
4328 
4329       // The input vector this mask element indexes into.
4330       unsigned Input = (unsigned)Idx / NewElts;
4331 
4332       if (Input >= array_lengthof(Inputs)) {
4333         // The mask element does not index into any input vector.
4334         Ops.push_back(-1);
4335         continue;
4336       }
4337 
4338       // Turn the index into an offset from the start of the input vector.
4339       Idx -= Input * NewElts;
4340 
4341       // Find or create a shuffle vector operand to hold this input.
4342       unsigned OpNo;
4343       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
4344         if (InputUsed[OpNo] == Input) {
4345           // This input vector is already an operand.
4346           break;
4347         } else if (InputUsed[OpNo] == -1U) {
4348           // Create a new operand for this input vector.
4349           InputUsed[OpNo] = Input;
4350           break;
4351         }
4352       }
4353 
4354       if (OpNo >= array_lengthof(InputUsed)) {
4355         // More than two input vectors used!  Give up on trying to create a
4356         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4357         UseBuildVector = true;
4358         break;
4359       }
4360 
4361       // Add the mask index for the new shuffle vector.
4362       Ops.push_back(Idx + OpNo * NewElts);
4363     }
4364 
4365     if (UseBuildVector) {
4366       LLT EltTy = NarrowTy.getElementType();
4367       SmallVector<Register, 16> SVOps;
4368 
4369       // Extract the input elements by hand.
4370       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4371         // The mask element.  This indexes into the input.
4372         int Idx = Mask[FirstMaskIdx + MaskOffset];
4373 
4374         // The input vector this mask element indexes into.
4375         unsigned Input = (unsigned)Idx / NewElts;
4376 
4377         if (Input >= array_lengthof(Inputs)) {
4378           // The mask element is "undef" or indexes off the end of the input.
4379           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4380           continue;
4381         }
4382 
4383         // Turn the index into an offset from the start of the input vector.
4384         Idx -= Input * NewElts;
4385 
4386         // Extract the vector element by hand.
4387         SVOps.push_back(MIRBuilder
4388                             .buildExtractVectorElement(
4389                                 EltTy, Inputs[Input],
4390                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4391                             .getReg(0));
4392       }
4393 
4394       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4395       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4396     } else if (InputUsed[0] == -1U) {
4397       // No input vectors were used! The result is undefined.
4398       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4399     } else {
4400       Register Op0 = Inputs[InputUsed[0]];
4401       // If only one input was used, use an undefined vector for the other.
4402       Register Op1 = InputUsed[1] == -1U
4403                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4404                          : Inputs[InputUsed[1]];
4405       // At least one input vector was used. Create a new shuffle vector.
4406       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4407     }
4408 
4409     Ops.clear();
4410   }
4411 
4412   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4413   MI.eraseFromParent();
4414   return Legalized;
4415 }
4416 
4417 static unsigned getScalarOpcForReduction(unsigned Opc) {
4418   unsigned ScalarOpc;
4419   switch (Opc) {
4420   case TargetOpcode::G_VECREDUCE_FADD:
4421     ScalarOpc = TargetOpcode::G_FADD;
4422     break;
4423   case TargetOpcode::G_VECREDUCE_FMUL:
4424     ScalarOpc = TargetOpcode::G_FMUL;
4425     break;
4426   case TargetOpcode::G_VECREDUCE_FMAX:
4427     ScalarOpc = TargetOpcode::G_FMAXNUM;
4428     break;
4429   case TargetOpcode::G_VECREDUCE_FMIN:
4430     ScalarOpc = TargetOpcode::G_FMINNUM;
4431     break;
4432   case TargetOpcode::G_VECREDUCE_ADD:
4433     ScalarOpc = TargetOpcode::G_ADD;
4434     break;
4435   case TargetOpcode::G_VECREDUCE_MUL:
4436     ScalarOpc = TargetOpcode::G_MUL;
4437     break;
4438   case TargetOpcode::G_VECREDUCE_AND:
4439     ScalarOpc = TargetOpcode::G_AND;
4440     break;
4441   case TargetOpcode::G_VECREDUCE_OR:
4442     ScalarOpc = TargetOpcode::G_OR;
4443     break;
4444   case TargetOpcode::G_VECREDUCE_XOR:
4445     ScalarOpc = TargetOpcode::G_XOR;
4446     break;
4447   case TargetOpcode::G_VECREDUCE_SMAX:
4448     ScalarOpc = TargetOpcode::G_SMAX;
4449     break;
4450   case TargetOpcode::G_VECREDUCE_SMIN:
4451     ScalarOpc = TargetOpcode::G_SMIN;
4452     break;
4453   case TargetOpcode::G_VECREDUCE_UMAX:
4454     ScalarOpc = TargetOpcode::G_UMAX;
4455     break;
4456   case TargetOpcode::G_VECREDUCE_UMIN:
4457     ScalarOpc = TargetOpcode::G_UMIN;
4458     break;
4459   default:
4460     llvm_unreachable("Unhandled reduction");
4461   }
4462   return ScalarOpc;
4463 }
4464 
4465 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4466     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4467   unsigned Opc = MI.getOpcode();
4468   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4469          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4470          "Sequential reductions not expected");
4471 
4472   if (TypeIdx != 1)
4473     return UnableToLegalize;
4474 
4475   // The semantics of the normal non-sequential reductions allow us to freely
4476   // re-associate the operation.
4477   Register SrcReg = MI.getOperand(1).getReg();
4478   LLT SrcTy = MRI.getType(SrcReg);
4479   Register DstReg = MI.getOperand(0).getReg();
4480   LLT DstTy = MRI.getType(DstReg);
4481 
4482   if (NarrowTy.isVector() &&
4483       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4484     return UnableToLegalize;
4485 
4486   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4487   SmallVector<Register> SplitSrcs;
4488   // If NarrowTy is a scalar then we're being asked to scalarize.
4489   const unsigned NumParts =
4490       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4491                           : SrcTy.getNumElements();
4492 
4493   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4494   if (NarrowTy.isScalar()) {
4495     if (DstTy != NarrowTy)
4496       return UnableToLegalize; // FIXME: handle implicit extensions.
4497 
4498     if (isPowerOf2_32(NumParts)) {
4499       // Generate a tree of scalar operations to reduce the critical path.
4500       SmallVector<Register> PartialResults;
4501       unsigned NumPartsLeft = NumParts;
4502       while (NumPartsLeft > 1) {
4503         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4504           PartialResults.emplace_back(
4505               MIRBuilder
4506                   .buildInstr(ScalarOpc, {NarrowTy},
4507                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4508                   .getReg(0));
4509         }
4510         SplitSrcs = PartialResults;
4511         PartialResults.clear();
4512         NumPartsLeft = SplitSrcs.size();
4513       }
4514       assert(SplitSrcs.size() == 1);
4515       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4516       MI.eraseFromParent();
4517       return Legalized;
4518     }
4519     // If we can't generate a tree, then just do sequential operations.
4520     Register Acc = SplitSrcs[0];
4521     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4522       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4523                 .getReg(0);
4524     MIRBuilder.buildCopy(DstReg, Acc);
4525     MI.eraseFromParent();
4526     return Legalized;
4527   }
4528   SmallVector<Register> PartialReductions;
4529   for (unsigned Part = 0; Part < NumParts; ++Part) {
4530     PartialReductions.push_back(
4531         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4532   }
4533 
4534 
4535   // If the types involved are powers of 2, we can generate intermediate vector
4536   // ops, before generating a final reduction operation.
4537   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4538       isPowerOf2_32(NarrowTy.getNumElements())) {
4539     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4540   }
4541 
4542   Register Acc = PartialReductions[0];
4543   for (unsigned Part = 1; Part < NumParts; ++Part) {
4544     if (Part == NumParts - 1) {
4545       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4546                             {Acc, PartialReductions[Part]});
4547     } else {
4548       Acc = MIRBuilder
4549                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4550                 .getReg(0);
4551     }
4552   }
4553   MI.eraseFromParent();
4554   return Legalized;
4555 }
4556 
4557 LegalizerHelper::LegalizeResult
4558 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4559                                         LLT SrcTy, LLT NarrowTy,
4560                                         unsigned ScalarOpc) {
4561   SmallVector<Register> SplitSrcs;
4562   // Split the sources into NarrowTy size pieces.
4563   extractParts(SrcReg, NarrowTy,
4564                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4565   // We're going to do a tree reduction using vector operations until we have
4566   // one NarrowTy size value left.
4567   while (SplitSrcs.size() > 1) {
4568     SmallVector<Register> PartialRdxs;
4569     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4570       Register LHS = SplitSrcs[Idx];
4571       Register RHS = SplitSrcs[Idx + 1];
4572       // Create the intermediate vector op.
4573       Register Res =
4574           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4575       PartialRdxs.push_back(Res);
4576     }
4577     SplitSrcs = std::move(PartialRdxs);
4578   }
4579   // Finally generate the requested NarrowTy based reduction.
4580   Observer.changingInstr(MI);
4581   MI.getOperand(1).setReg(SplitSrcs[0]);
4582   Observer.changedInstr(MI);
4583   return Legalized;
4584 }
4585 
4586 LegalizerHelper::LegalizeResult
4587 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4588                                              const LLT HalfTy, const LLT AmtTy) {
4589 
4590   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4591   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4592   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4593 
4594   if (Amt.isZero()) {
4595     MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
4596     MI.eraseFromParent();
4597     return Legalized;
4598   }
4599 
4600   LLT NVT = HalfTy;
4601   unsigned NVTBits = HalfTy.getSizeInBits();
4602   unsigned VTBits = 2 * NVTBits;
4603 
4604   SrcOp Lo(Register(0)), Hi(Register(0));
4605   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4606     if (Amt.ugt(VTBits)) {
4607       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4608     } else if (Amt.ugt(NVTBits)) {
4609       Lo = MIRBuilder.buildConstant(NVT, 0);
4610       Hi = MIRBuilder.buildShl(NVT, InL,
4611                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4612     } else if (Amt == NVTBits) {
4613       Lo = MIRBuilder.buildConstant(NVT, 0);
4614       Hi = InL;
4615     } else {
4616       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4617       auto OrLHS =
4618           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4619       auto OrRHS = MIRBuilder.buildLShr(
4620           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4621       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4622     }
4623   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4624     if (Amt.ugt(VTBits)) {
4625       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4626     } else if (Amt.ugt(NVTBits)) {
4627       Lo = MIRBuilder.buildLShr(NVT, InH,
4628                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4629       Hi = MIRBuilder.buildConstant(NVT, 0);
4630     } else if (Amt == NVTBits) {
4631       Lo = InH;
4632       Hi = MIRBuilder.buildConstant(NVT, 0);
4633     } else {
4634       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4635 
4636       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4637       auto OrRHS = MIRBuilder.buildShl(
4638           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4639 
4640       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4641       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4642     }
4643   } else {
4644     if (Amt.ugt(VTBits)) {
4645       Hi = Lo = MIRBuilder.buildAShr(
4646           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4647     } else if (Amt.ugt(NVTBits)) {
4648       Lo = MIRBuilder.buildAShr(NVT, InH,
4649                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4650       Hi = MIRBuilder.buildAShr(NVT, InH,
4651                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4652     } else if (Amt == NVTBits) {
4653       Lo = InH;
4654       Hi = MIRBuilder.buildAShr(NVT, InH,
4655                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4656     } else {
4657       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4658 
4659       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4660       auto OrRHS = MIRBuilder.buildShl(
4661           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4662 
4663       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4664       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4665     }
4666   }
4667 
4668   MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
4669   MI.eraseFromParent();
4670 
4671   return Legalized;
4672 }
4673 
4674 // TODO: Optimize if constant shift amount.
4675 LegalizerHelper::LegalizeResult
4676 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4677                                    LLT RequestedTy) {
4678   if (TypeIdx == 1) {
4679     Observer.changingInstr(MI);
4680     narrowScalarSrc(MI, RequestedTy, 2);
4681     Observer.changedInstr(MI);
4682     return Legalized;
4683   }
4684 
4685   Register DstReg = MI.getOperand(0).getReg();
4686   LLT DstTy = MRI.getType(DstReg);
4687   if (DstTy.isVector())
4688     return UnableToLegalize;
4689 
4690   Register Amt = MI.getOperand(2).getReg();
4691   LLT ShiftAmtTy = MRI.getType(Amt);
4692   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4693   if (DstEltSize % 2 != 0)
4694     return UnableToLegalize;
4695 
4696   // Ignore the input type. We can only go to exactly half the size of the
4697   // input. If that isn't small enough, the resulting pieces will be further
4698   // legalized.
4699   const unsigned NewBitSize = DstEltSize / 2;
4700   const LLT HalfTy = LLT::scalar(NewBitSize);
4701   const LLT CondTy = LLT::scalar(1);
4702 
4703   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4704     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4705                                        ShiftAmtTy);
4706   }
4707 
4708   // TODO: Expand with known bits.
4709 
4710   // Handle the fully general expansion by an unknown amount.
4711   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4712 
4713   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4714   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4715   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4716 
4717   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4718   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4719 
4720   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4721   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4722   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4723 
4724   Register ResultRegs[2];
4725   switch (MI.getOpcode()) {
4726   case TargetOpcode::G_SHL: {
4727     // Short: ShAmt < NewBitSize
4728     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4729 
4730     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4731     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4732     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4733 
4734     // Long: ShAmt >= NewBitSize
4735     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4736     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4737 
4738     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4739     auto Hi = MIRBuilder.buildSelect(
4740         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4741 
4742     ResultRegs[0] = Lo.getReg(0);
4743     ResultRegs[1] = Hi.getReg(0);
4744     break;
4745   }
4746   case TargetOpcode::G_LSHR:
4747   case TargetOpcode::G_ASHR: {
4748     // Short: ShAmt < NewBitSize
4749     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4750 
4751     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4752     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4753     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4754 
4755     // Long: ShAmt >= NewBitSize
4756     MachineInstrBuilder HiL;
4757     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4758       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4759     } else {
4760       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4761       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4762     }
4763     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4764                                      {InH, AmtExcess});     // Lo from Hi part.
4765 
4766     auto Lo = MIRBuilder.buildSelect(
4767         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4768 
4769     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4770 
4771     ResultRegs[0] = Lo.getReg(0);
4772     ResultRegs[1] = Hi.getReg(0);
4773     break;
4774   }
4775   default:
4776     llvm_unreachable("not a shift");
4777   }
4778 
4779   MIRBuilder.buildMerge(DstReg, ResultRegs);
4780   MI.eraseFromParent();
4781   return Legalized;
4782 }
4783 
4784 LegalizerHelper::LegalizeResult
4785 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
4786                                        LLT MoreTy) {
4787   assert(TypeIdx == 0 && "Expecting only Idx 0");
4788 
4789   Observer.changingInstr(MI);
4790   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4791     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
4792     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
4793     moreElementsVectorSrc(MI, MoreTy, I);
4794   }
4795 
4796   MachineBasicBlock &MBB = *MI.getParent();
4797   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
4798   moreElementsVectorDst(MI, MoreTy, 0);
4799   Observer.changedInstr(MI);
4800   return Legalized;
4801 }
4802 
4803 LegalizerHelper::LegalizeResult
4804 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
4805                                     LLT MoreTy) {
4806   unsigned Opc = MI.getOpcode();
4807   switch (Opc) {
4808   case TargetOpcode::G_IMPLICIT_DEF:
4809   case TargetOpcode::G_LOAD: {
4810     if (TypeIdx != 0)
4811       return UnableToLegalize;
4812     Observer.changingInstr(MI);
4813     moreElementsVectorDst(MI, MoreTy, 0);
4814     Observer.changedInstr(MI);
4815     return Legalized;
4816   }
4817   case TargetOpcode::G_STORE:
4818     if (TypeIdx != 0)
4819       return UnableToLegalize;
4820     Observer.changingInstr(MI);
4821     moreElementsVectorSrc(MI, MoreTy, 0);
4822     Observer.changedInstr(MI);
4823     return Legalized;
4824   case TargetOpcode::G_AND:
4825   case TargetOpcode::G_OR:
4826   case TargetOpcode::G_XOR:
4827   case TargetOpcode::G_ADD:
4828   case TargetOpcode::G_SUB:
4829   case TargetOpcode::G_MUL:
4830   case TargetOpcode::G_FADD:
4831   case TargetOpcode::G_FMUL:
4832   case TargetOpcode::G_UADDSAT:
4833   case TargetOpcode::G_USUBSAT:
4834   case TargetOpcode::G_SADDSAT:
4835   case TargetOpcode::G_SSUBSAT:
4836   case TargetOpcode::G_SMIN:
4837   case TargetOpcode::G_SMAX:
4838   case TargetOpcode::G_UMIN:
4839   case TargetOpcode::G_UMAX:
4840   case TargetOpcode::G_FMINNUM:
4841   case TargetOpcode::G_FMAXNUM:
4842   case TargetOpcode::G_FMINNUM_IEEE:
4843   case TargetOpcode::G_FMAXNUM_IEEE:
4844   case TargetOpcode::G_FMINIMUM:
4845   case TargetOpcode::G_FMAXIMUM: {
4846     Observer.changingInstr(MI);
4847     moreElementsVectorSrc(MI, MoreTy, 1);
4848     moreElementsVectorSrc(MI, MoreTy, 2);
4849     moreElementsVectorDst(MI, MoreTy, 0);
4850     Observer.changedInstr(MI);
4851     return Legalized;
4852   }
4853   case TargetOpcode::G_FMA:
4854   case TargetOpcode::G_FSHR:
4855   case TargetOpcode::G_FSHL: {
4856     Observer.changingInstr(MI);
4857     moreElementsVectorSrc(MI, MoreTy, 1);
4858     moreElementsVectorSrc(MI, MoreTy, 2);
4859     moreElementsVectorSrc(MI, MoreTy, 3);
4860     moreElementsVectorDst(MI, MoreTy, 0);
4861     Observer.changedInstr(MI);
4862     return Legalized;
4863   }
4864   case TargetOpcode::G_EXTRACT:
4865     if (TypeIdx != 1)
4866       return UnableToLegalize;
4867     Observer.changingInstr(MI);
4868     moreElementsVectorSrc(MI, MoreTy, 1);
4869     Observer.changedInstr(MI);
4870     return Legalized;
4871   case TargetOpcode::G_INSERT:
4872   case TargetOpcode::G_FREEZE:
4873   case TargetOpcode::G_FNEG:
4874   case TargetOpcode::G_FABS:
4875   case TargetOpcode::G_BSWAP:
4876   case TargetOpcode::G_FCANONICALIZE:
4877   case TargetOpcode::G_SEXT_INREG:
4878     if (TypeIdx != 0)
4879       return UnableToLegalize;
4880     Observer.changingInstr(MI);
4881     moreElementsVectorSrc(MI, MoreTy, 1);
4882     moreElementsVectorDst(MI, MoreTy, 0);
4883     Observer.changedInstr(MI);
4884     return Legalized;
4885   case TargetOpcode::G_SELECT:
4886     if (TypeIdx != 0)
4887       return UnableToLegalize;
4888     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4889       return UnableToLegalize;
4890 
4891     Observer.changingInstr(MI);
4892     moreElementsVectorSrc(MI, MoreTy, 2);
4893     moreElementsVectorSrc(MI, MoreTy, 3);
4894     moreElementsVectorDst(MI, MoreTy, 0);
4895     Observer.changedInstr(MI);
4896     return Legalized;
4897   case TargetOpcode::G_UNMERGE_VALUES:
4898     return UnableToLegalize;
4899   case TargetOpcode::G_PHI:
4900     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
4901   case TargetOpcode::G_SHUFFLE_VECTOR:
4902     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
4903   case TargetOpcode::G_BUILD_VECTOR: {
4904     SmallVector<SrcOp, 8> Elts;
4905     for (auto Op : MI.uses()) {
4906       Elts.push_back(Op.getReg());
4907     }
4908 
4909     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
4910       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
4911     }
4912 
4913     MIRBuilder.buildDeleteTrailingVectorElements(
4914         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
4915     MI.eraseFromParent();
4916     return Legalized;
4917   }
4918   case TargetOpcode::G_TRUNC: {
4919     Observer.changingInstr(MI);
4920     moreElementsVectorSrc(MI, MoreTy, 1);
4921     moreElementsVectorDst(MI, MoreTy, 0);
4922     Observer.changedInstr(MI);
4923     return Legalized;
4924   }
4925   default:
4926     return UnableToLegalize;
4927   }
4928 }
4929 
4930 LegalizerHelper::LegalizeResult
4931 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
4932                                            unsigned int TypeIdx, LLT MoreTy) {
4933   if (TypeIdx != 0)
4934     return UnableToLegalize;
4935 
4936   Register DstReg = MI.getOperand(0).getReg();
4937   Register Src1Reg = MI.getOperand(1).getReg();
4938   Register Src2Reg = MI.getOperand(2).getReg();
4939   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4940   LLT DstTy = MRI.getType(DstReg);
4941   LLT Src1Ty = MRI.getType(Src1Reg);
4942   LLT Src2Ty = MRI.getType(Src2Reg);
4943   unsigned NumElts = DstTy.getNumElements();
4944   unsigned WidenNumElts = MoreTy.getNumElements();
4945 
4946   // Expect a canonicalized shuffle.
4947   if (DstTy != Src1Ty || DstTy != Src2Ty)
4948     return UnableToLegalize;
4949 
4950   moreElementsVectorSrc(MI, MoreTy, 1);
4951   moreElementsVectorSrc(MI, MoreTy, 2);
4952 
4953   // Adjust mask based on new input vector length.
4954   SmallVector<int, 16> NewMask;
4955   for (unsigned I = 0; I != NumElts; ++I) {
4956     int Idx = Mask[I];
4957     if (Idx < static_cast<int>(NumElts))
4958       NewMask.push_back(Idx);
4959     else
4960       NewMask.push_back(Idx - NumElts + WidenNumElts);
4961   }
4962   for (unsigned I = NumElts; I != WidenNumElts; ++I)
4963     NewMask.push_back(-1);
4964   moreElementsVectorDst(MI, MoreTy, 0);
4965   MIRBuilder.setInstrAndDebugLoc(MI);
4966   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
4967                                 MI.getOperand(1).getReg(),
4968                                 MI.getOperand(2).getReg(), NewMask);
4969   MI.eraseFromParent();
4970   return Legalized;
4971 }
4972 
4973 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
4974                                         ArrayRef<Register> Src1Regs,
4975                                         ArrayRef<Register> Src2Regs,
4976                                         LLT NarrowTy) {
4977   MachineIRBuilder &B = MIRBuilder;
4978   unsigned SrcParts = Src1Regs.size();
4979   unsigned DstParts = DstRegs.size();
4980 
4981   unsigned DstIdx = 0; // Low bits of the result.
4982   Register FactorSum =
4983       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
4984   DstRegs[DstIdx] = FactorSum;
4985 
4986   unsigned CarrySumPrevDstIdx;
4987   SmallVector<Register, 4> Factors;
4988 
4989   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
4990     // Collect low parts of muls for DstIdx.
4991     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
4992          i <= std::min(DstIdx, SrcParts - 1); ++i) {
4993       MachineInstrBuilder Mul =
4994           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
4995       Factors.push_back(Mul.getReg(0));
4996     }
4997     // Collect high parts of muls from previous DstIdx.
4998     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
4999          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5000       MachineInstrBuilder Umulh =
5001           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5002       Factors.push_back(Umulh.getReg(0));
5003     }
5004     // Add CarrySum from additions calculated for previous DstIdx.
5005     if (DstIdx != 1) {
5006       Factors.push_back(CarrySumPrevDstIdx);
5007     }
5008 
5009     Register CarrySum;
5010     // Add all factors and accumulate all carries into CarrySum.
5011     if (DstIdx != DstParts - 1) {
5012       MachineInstrBuilder Uaddo =
5013           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5014       FactorSum = Uaddo.getReg(0);
5015       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5016       for (unsigned i = 2; i < Factors.size(); ++i) {
5017         MachineInstrBuilder Uaddo =
5018             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5019         FactorSum = Uaddo.getReg(0);
5020         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5021         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5022       }
5023     } else {
5024       // Since value for the next index is not calculated, neither is CarrySum.
5025       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5026       for (unsigned i = 2; i < Factors.size(); ++i)
5027         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5028     }
5029 
5030     CarrySumPrevDstIdx = CarrySum;
5031     DstRegs[DstIdx] = FactorSum;
5032     Factors.clear();
5033   }
5034 }
5035 
5036 LegalizerHelper::LegalizeResult
5037 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5038                                     LLT NarrowTy) {
5039   if (TypeIdx != 0)
5040     return UnableToLegalize;
5041 
5042   Register DstReg = MI.getOperand(0).getReg();
5043   LLT DstType = MRI.getType(DstReg);
5044   // FIXME: add support for vector types
5045   if (DstType.isVector())
5046     return UnableToLegalize;
5047 
5048   unsigned Opcode = MI.getOpcode();
5049   unsigned OpO, OpE, OpF;
5050   switch (Opcode) {
5051   case TargetOpcode::G_SADDO:
5052   case TargetOpcode::G_SADDE:
5053   case TargetOpcode::G_UADDO:
5054   case TargetOpcode::G_UADDE:
5055   case TargetOpcode::G_ADD:
5056     OpO = TargetOpcode::G_UADDO;
5057     OpE = TargetOpcode::G_UADDE;
5058     OpF = TargetOpcode::G_UADDE;
5059     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5060       OpF = TargetOpcode::G_SADDE;
5061     break;
5062   case TargetOpcode::G_SSUBO:
5063   case TargetOpcode::G_SSUBE:
5064   case TargetOpcode::G_USUBO:
5065   case TargetOpcode::G_USUBE:
5066   case TargetOpcode::G_SUB:
5067     OpO = TargetOpcode::G_USUBO;
5068     OpE = TargetOpcode::G_USUBE;
5069     OpF = TargetOpcode::G_USUBE;
5070     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5071       OpF = TargetOpcode::G_SSUBE;
5072     break;
5073   default:
5074     llvm_unreachable("Unexpected add/sub opcode!");
5075   }
5076 
5077   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5078   unsigned NumDefs = MI.getNumExplicitDefs();
5079   Register Src1 = MI.getOperand(NumDefs).getReg();
5080   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5081   Register CarryDst, CarryIn;
5082   if (NumDefs == 2)
5083     CarryDst = MI.getOperand(1).getReg();
5084   if (MI.getNumOperands() == NumDefs + 3)
5085     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5086 
5087   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5088   LLT LeftoverTy, DummyTy;
5089   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5090   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5091   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5092 
5093   int NarrowParts = Src1Regs.size();
5094   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5095     Src1Regs.push_back(Src1Left[I]);
5096     Src2Regs.push_back(Src2Left[I]);
5097   }
5098   DstRegs.reserve(Src1Regs.size());
5099 
5100   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5101     Register DstReg =
5102         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5103     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5104     // Forward the final carry-out to the destination register
5105     if (i == e - 1 && CarryDst)
5106       CarryOut = CarryDst;
5107 
5108     if (!CarryIn) {
5109       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5110                             {Src1Regs[i], Src2Regs[i]});
5111     } else if (i == e - 1) {
5112       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5113                             {Src1Regs[i], Src2Regs[i], CarryIn});
5114     } else {
5115       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5116                             {Src1Regs[i], Src2Regs[i], CarryIn});
5117     }
5118 
5119     DstRegs.push_back(DstReg);
5120     CarryIn = CarryOut;
5121   }
5122   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5123               makeArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5124               makeArrayRef(DstRegs).drop_front(NarrowParts));
5125 
5126   MI.eraseFromParent();
5127   return Legalized;
5128 }
5129 
5130 LegalizerHelper::LegalizeResult
5131 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5132   Register DstReg = MI.getOperand(0).getReg();
5133   Register Src1 = MI.getOperand(1).getReg();
5134   Register Src2 = MI.getOperand(2).getReg();
5135 
5136   LLT Ty = MRI.getType(DstReg);
5137   if (Ty.isVector())
5138     return UnableToLegalize;
5139 
5140   unsigned Size = Ty.getSizeInBits();
5141   unsigned NarrowSize = NarrowTy.getSizeInBits();
5142   if (Size % NarrowSize != 0)
5143     return UnableToLegalize;
5144 
5145   unsigned NumParts = Size / NarrowSize;
5146   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5147   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5148 
5149   SmallVector<Register, 2> Src1Parts, Src2Parts;
5150   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5151   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5152   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5153   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5154 
5155   // Take only high half of registers if this is high mul.
5156   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5157   MIRBuilder.buildMerge(DstReg, DstRegs);
5158   MI.eraseFromParent();
5159   return Legalized;
5160 }
5161 
5162 LegalizerHelper::LegalizeResult
5163 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5164                                    LLT NarrowTy) {
5165   if (TypeIdx != 0)
5166     return UnableToLegalize;
5167 
5168   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5169 
5170   Register Src = MI.getOperand(1).getReg();
5171   LLT SrcTy = MRI.getType(Src);
5172 
5173   // If all finite floats fit into the narrowed integer type, we can just swap
5174   // out the result type. This is practically only useful for conversions from
5175   // half to at least 16-bits, so just handle the one case.
5176   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5177       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5178     return UnableToLegalize;
5179 
5180   Observer.changingInstr(MI);
5181   narrowScalarDst(MI, NarrowTy, 0,
5182                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5183   Observer.changedInstr(MI);
5184   return Legalized;
5185 }
5186 
5187 LegalizerHelper::LegalizeResult
5188 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5189                                      LLT NarrowTy) {
5190   if (TypeIdx != 1)
5191     return UnableToLegalize;
5192 
5193   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5194 
5195   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5196   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5197   // NarrowSize.
5198   if (SizeOp1 % NarrowSize != 0)
5199     return UnableToLegalize;
5200   int NumParts = SizeOp1 / NarrowSize;
5201 
5202   SmallVector<Register, 2> SrcRegs, DstRegs;
5203   SmallVector<uint64_t, 2> Indexes;
5204   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5205 
5206   Register OpReg = MI.getOperand(0).getReg();
5207   uint64_t OpStart = MI.getOperand(2).getImm();
5208   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5209   for (int i = 0; i < NumParts; ++i) {
5210     unsigned SrcStart = i * NarrowSize;
5211 
5212     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5213       // No part of the extract uses this subregister, ignore it.
5214       continue;
5215     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5216       // The entire subregister is extracted, forward the value.
5217       DstRegs.push_back(SrcRegs[i]);
5218       continue;
5219     }
5220 
5221     // OpSegStart is where this destination segment would start in OpReg if it
5222     // extended infinitely in both directions.
5223     int64_t ExtractOffset;
5224     uint64_t SegSize;
5225     if (OpStart < SrcStart) {
5226       ExtractOffset = 0;
5227       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5228     } else {
5229       ExtractOffset = OpStart - SrcStart;
5230       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5231     }
5232 
5233     Register SegReg = SrcRegs[i];
5234     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5235       // A genuine extract is needed.
5236       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5237       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5238     }
5239 
5240     DstRegs.push_back(SegReg);
5241   }
5242 
5243   Register DstReg = MI.getOperand(0).getReg();
5244   if (MRI.getType(DstReg).isVector())
5245     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5246   else if (DstRegs.size() > 1)
5247     MIRBuilder.buildMerge(DstReg, DstRegs);
5248   else
5249     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5250   MI.eraseFromParent();
5251   return Legalized;
5252 }
5253 
5254 LegalizerHelper::LegalizeResult
5255 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5256                                     LLT NarrowTy) {
5257   // FIXME: Don't know how to handle secondary types yet.
5258   if (TypeIdx != 0)
5259     return UnableToLegalize;
5260 
5261   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5262   SmallVector<uint64_t, 2> Indexes;
5263   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5264   LLT LeftoverTy;
5265   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5266                LeftoverRegs);
5267 
5268   for (Register Reg : LeftoverRegs)
5269     SrcRegs.push_back(Reg);
5270 
5271   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5272   Register OpReg = MI.getOperand(2).getReg();
5273   uint64_t OpStart = MI.getOperand(3).getImm();
5274   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5275   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5276     unsigned DstStart = I * NarrowSize;
5277 
5278     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5279       // The entire subregister is defined by this insert, forward the new
5280       // value.
5281       DstRegs.push_back(OpReg);
5282       continue;
5283     }
5284 
5285     Register SrcReg = SrcRegs[I];
5286     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5287       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5288       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5289       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5290     }
5291 
5292     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5293       // No part of the insert affects this subregister, forward the original.
5294       DstRegs.push_back(SrcReg);
5295       continue;
5296     }
5297 
5298     // OpSegStart is where this destination segment would start in OpReg if it
5299     // extended infinitely in both directions.
5300     int64_t ExtractOffset, InsertOffset;
5301     uint64_t SegSize;
5302     if (OpStart < DstStart) {
5303       InsertOffset = 0;
5304       ExtractOffset = DstStart - OpStart;
5305       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5306     } else {
5307       InsertOffset = OpStart - DstStart;
5308       ExtractOffset = 0;
5309       SegSize =
5310         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5311     }
5312 
5313     Register SegReg = OpReg;
5314     if (ExtractOffset != 0 || SegSize != OpSize) {
5315       // A genuine extract is needed.
5316       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5317       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5318     }
5319 
5320     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5321     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5322     DstRegs.push_back(DstReg);
5323   }
5324 
5325   uint64_t WideSize = DstRegs.size() * NarrowSize;
5326   Register DstReg = MI.getOperand(0).getReg();
5327   if (WideSize > RegTy.getSizeInBits()) {
5328     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5329     MIRBuilder.buildMerge(MergeReg, DstRegs);
5330     MIRBuilder.buildTrunc(DstReg, MergeReg);
5331   } else
5332     MIRBuilder.buildMerge(DstReg, DstRegs);
5333 
5334   MI.eraseFromParent();
5335   return Legalized;
5336 }
5337 
5338 LegalizerHelper::LegalizeResult
5339 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5340                                    LLT NarrowTy) {
5341   Register DstReg = MI.getOperand(0).getReg();
5342   LLT DstTy = MRI.getType(DstReg);
5343 
5344   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5345 
5346   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5347   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5348   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5349   LLT LeftoverTy;
5350   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5351                     Src0Regs, Src0LeftoverRegs))
5352     return UnableToLegalize;
5353 
5354   LLT Unused;
5355   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5356                     Src1Regs, Src1LeftoverRegs))
5357     llvm_unreachable("inconsistent extractParts result");
5358 
5359   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5360     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5361                                         {Src0Regs[I], Src1Regs[I]});
5362     DstRegs.push_back(Inst.getReg(0));
5363   }
5364 
5365   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5366     auto Inst = MIRBuilder.buildInstr(
5367       MI.getOpcode(),
5368       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5369     DstLeftoverRegs.push_back(Inst.getReg(0));
5370   }
5371 
5372   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5373               LeftoverTy, DstLeftoverRegs);
5374 
5375   MI.eraseFromParent();
5376   return Legalized;
5377 }
5378 
5379 LegalizerHelper::LegalizeResult
5380 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5381                                  LLT NarrowTy) {
5382   if (TypeIdx != 0)
5383     return UnableToLegalize;
5384 
5385   Register DstReg = MI.getOperand(0).getReg();
5386   Register SrcReg = MI.getOperand(1).getReg();
5387 
5388   LLT DstTy = MRI.getType(DstReg);
5389   if (DstTy.isVector())
5390     return UnableToLegalize;
5391 
5392   SmallVector<Register, 8> Parts;
5393   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5394   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5395   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5396 
5397   MI.eraseFromParent();
5398   return Legalized;
5399 }
5400 
5401 LegalizerHelper::LegalizeResult
5402 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5403                                     LLT NarrowTy) {
5404   if (TypeIdx != 0)
5405     return UnableToLegalize;
5406 
5407   Register CondReg = MI.getOperand(1).getReg();
5408   LLT CondTy = MRI.getType(CondReg);
5409   if (CondTy.isVector()) // TODO: Handle vselect
5410     return UnableToLegalize;
5411 
5412   Register DstReg = MI.getOperand(0).getReg();
5413   LLT DstTy = MRI.getType(DstReg);
5414 
5415   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5416   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5417   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5418   LLT LeftoverTy;
5419   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5420                     Src1Regs, Src1LeftoverRegs))
5421     return UnableToLegalize;
5422 
5423   LLT Unused;
5424   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5425                     Src2Regs, Src2LeftoverRegs))
5426     llvm_unreachable("inconsistent extractParts result");
5427 
5428   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5429     auto Select = MIRBuilder.buildSelect(NarrowTy,
5430                                          CondReg, Src1Regs[I], Src2Regs[I]);
5431     DstRegs.push_back(Select.getReg(0));
5432   }
5433 
5434   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5435     auto Select = MIRBuilder.buildSelect(
5436       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5437     DstLeftoverRegs.push_back(Select.getReg(0));
5438   }
5439 
5440   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5441               LeftoverTy, DstLeftoverRegs);
5442 
5443   MI.eraseFromParent();
5444   return Legalized;
5445 }
5446 
5447 LegalizerHelper::LegalizeResult
5448 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5449                                   LLT NarrowTy) {
5450   if (TypeIdx != 1)
5451     return UnableToLegalize;
5452 
5453   Register DstReg = MI.getOperand(0).getReg();
5454   Register SrcReg = MI.getOperand(1).getReg();
5455   LLT DstTy = MRI.getType(DstReg);
5456   LLT SrcTy = MRI.getType(SrcReg);
5457   unsigned NarrowSize = NarrowTy.getSizeInBits();
5458 
5459   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5460     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5461 
5462     MachineIRBuilder &B = MIRBuilder;
5463     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5464     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5465     auto C_0 = B.buildConstant(NarrowTy, 0);
5466     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5467                                 UnmergeSrc.getReg(1), C_0);
5468     auto LoCTLZ = IsUndef ?
5469       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5470       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5471     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5472     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5473     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5474     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5475 
5476     MI.eraseFromParent();
5477     return Legalized;
5478   }
5479 
5480   return UnableToLegalize;
5481 }
5482 
5483 LegalizerHelper::LegalizeResult
5484 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5485                                   LLT NarrowTy) {
5486   if (TypeIdx != 1)
5487     return UnableToLegalize;
5488 
5489   Register DstReg = MI.getOperand(0).getReg();
5490   Register SrcReg = MI.getOperand(1).getReg();
5491   LLT DstTy = MRI.getType(DstReg);
5492   LLT SrcTy = MRI.getType(SrcReg);
5493   unsigned NarrowSize = NarrowTy.getSizeInBits();
5494 
5495   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5496     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5497 
5498     MachineIRBuilder &B = MIRBuilder;
5499     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5500     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5501     auto C_0 = B.buildConstant(NarrowTy, 0);
5502     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5503                                 UnmergeSrc.getReg(0), C_0);
5504     auto HiCTTZ = IsUndef ?
5505       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5506       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5507     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5508     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5509     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5510     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5511 
5512     MI.eraseFromParent();
5513     return Legalized;
5514   }
5515 
5516   return UnableToLegalize;
5517 }
5518 
5519 LegalizerHelper::LegalizeResult
5520 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5521                                    LLT NarrowTy) {
5522   if (TypeIdx != 1)
5523     return UnableToLegalize;
5524 
5525   Register DstReg = MI.getOperand(0).getReg();
5526   LLT DstTy = MRI.getType(DstReg);
5527   LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
5528   unsigned NarrowSize = NarrowTy.getSizeInBits();
5529 
5530   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5531     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5532 
5533     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5534     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5535     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5536 
5537     MI.eraseFromParent();
5538     return Legalized;
5539   }
5540 
5541   return UnableToLegalize;
5542 }
5543 
5544 LegalizerHelper::LegalizeResult
5545 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5546   unsigned Opc = MI.getOpcode();
5547   const auto &TII = MIRBuilder.getTII();
5548   auto isSupported = [this](const LegalityQuery &Q) {
5549     auto QAction = LI.getAction(Q).Action;
5550     return QAction == Legal || QAction == Libcall || QAction == Custom;
5551   };
5552   switch (Opc) {
5553   default:
5554     return UnableToLegalize;
5555   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5556     // This trivially expands to CTLZ.
5557     Observer.changingInstr(MI);
5558     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5559     Observer.changedInstr(MI);
5560     return Legalized;
5561   }
5562   case TargetOpcode::G_CTLZ: {
5563     Register DstReg = MI.getOperand(0).getReg();
5564     Register SrcReg = MI.getOperand(1).getReg();
5565     LLT DstTy = MRI.getType(DstReg);
5566     LLT SrcTy = MRI.getType(SrcReg);
5567     unsigned Len = SrcTy.getSizeInBits();
5568 
5569     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5570       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5571       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5572       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5573       auto ICmp = MIRBuilder.buildICmp(
5574           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5575       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5576       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5577       MI.eraseFromParent();
5578       return Legalized;
5579     }
5580     // for now, we do this:
5581     // NewLen = NextPowerOf2(Len);
5582     // x = x | (x >> 1);
5583     // x = x | (x >> 2);
5584     // ...
5585     // x = x | (x >>16);
5586     // x = x | (x >>32); // for 64-bit input
5587     // Upto NewLen/2
5588     // return Len - popcount(x);
5589     //
5590     // Ref: "Hacker's Delight" by Henry Warren
5591     Register Op = SrcReg;
5592     unsigned NewLen = PowerOf2Ceil(Len);
5593     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5594       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5595       auto MIBOp = MIRBuilder.buildOr(
5596           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5597       Op = MIBOp.getReg(0);
5598     }
5599     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5600     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5601                         MIBPop);
5602     MI.eraseFromParent();
5603     return Legalized;
5604   }
5605   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5606     // This trivially expands to CTTZ.
5607     Observer.changingInstr(MI);
5608     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5609     Observer.changedInstr(MI);
5610     return Legalized;
5611   }
5612   case TargetOpcode::G_CTTZ: {
5613     Register DstReg = MI.getOperand(0).getReg();
5614     Register SrcReg = MI.getOperand(1).getReg();
5615     LLT DstTy = MRI.getType(DstReg);
5616     LLT SrcTy = MRI.getType(SrcReg);
5617 
5618     unsigned Len = SrcTy.getSizeInBits();
5619     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5620       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5621       // zero.
5622       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5623       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5624       auto ICmp = MIRBuilder.buildICmp(
5625           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5626       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5627       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5628       MI.eraseFromParent();
5629       return Legalized;
5630     }
5631     // for now, we use: { return popcount(~x & (x - 1)); }
5632     // unless the target has ctlz but not ctpop, in which case we use:
5633     // { return 32 - nlz(~x & (x-1)); }
5634     // Ref: "Hacker's Delight" by Henry Warren
5635     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5636     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5637     auto MIBTmp = MIRBuilder.buildAnd(
5638         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5639     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5640         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5641       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5642       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5643                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5644       MI.eraseFromParent();
5645       return Legalized;
5646     }
5647     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5648     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5649     return Legalized;
5650   }
5651   case TargetOpcode::G_CTPOP: {
5652     Register SrcReg = MI.getOperand(1).getReg();
5653     LLT Ty = MRI.getType(SrcReg);
5654     unsigned Size = Ty.getSizeInBits();
5655     MachineIRBuilder &B = MIRBuilder;
5656 
5657     // Count set bits in blocks of 2 bits. Default approach would be
5658     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5659     // We use following formula instead:
5660     // B2Count = val - { (val >> 1) & 0x55555555 }
5661     // since it gives same result in blocks of 2 with one instruction less.
5662     auto C_1 = B.buildConstant(Ty, 1);
5663     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5664     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5665     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5666     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5667     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5668 
5669     // In order to get count in blocks of 4 add values from adjacent block of 2.
5670     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5671     auto C_2 = B.buildConstant(Ty, 2);
5672     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5673     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5674     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5675     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5676     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5677     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5678 
5679     // For count in blocks of 8 bits we don't have to mask high 4 bits before
5680     // addition since count value sits in range {0,...,8} and 4 bits are enough
5681     // to hold such binary values. After addition high 4 bits still hold count
5682     // of set bits in high 4 bit block, set them to zero and get 8 bit result.
5683     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5684     auto C_4 = B.buildConstant(Ty, 4);
5685     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5686     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5687     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5688     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5689     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5690 
5691     assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
5692     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
5693     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
5694     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5695     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5696 
5697     // Shift count result from 8 high bits to low bits.
5698     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5699     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5700 
5701     MI.eraseFromParent();
5702     return Legalized;
5703   }
5704   }
5705 }
5706 
5707 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5708 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5709                                         Register Reg, unsigned BW) {
5710   return matchUnaryPredicate(
5711       MRI, Reg,
5712       [=](const Constant *C) {
5713         // Null constant here means an undef.
5714         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5715         return !CI || CI->getValue().urem(BW) != 0;
5716       },
5717       /*AllowUndefs*/ true);
5718 }
5719 
5720 LegalizerHelper::LegalizeResult
5721 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5722   Register Dst = MI.getOperand(0).getReg();
5723   Register X = MI.getOperand(1).getReg();
5724   Register Y = MI.getOperand(2).getReg();
5725   Register Z = MI.getOperand(3).getReg();
5726   LLT Ty = MRI.getType(Dst);
5727   LLT ShTy = MRI.getType(Z);
5728 
5729   unsigned BW = Ty.getScalarSizeInBits();
5730 
5731   if (!isPowerOf2_32(BW))
5732     return UnableToLegalize;
5733 
5734   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5735   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5736 
5737   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5738     // fshl X, Y, Z -> fshr X, Y, -Z
5739     // fshr X, Y, Z -> fshl X, Y, -Z
5740     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5741     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5742   } else {
5743     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5744     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5745     auto One = MIRBuilder.buildConstant(ShTy, 1);
5746     if (IsFSHL) {
5747       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5748       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5749     } else {
5750       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5751       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5752     }
5753 
5754     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5755   }
5756 
5757   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5758   MI.eraseFromParent();
5759   return Legalized;
5760 }
5761 
5762 LegalizerHelper::LegalizeResult
5763 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5764   Register Dst = MI.getOperand(0).getReg();
5765   Register X = MI.getOperand(1).getReg();
5766   Register Y = MI.getOperand(2).getReg();
5767   Register Z = MI.getOperand(3).getReg();
5768   LLT Ty = MRI.getType(Dst);
5769   LLT ShTy = MRI.getType(Z);
5770 
5771   const unsigned BW = Ty.getScalarSizeInBits();
5772   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5773 
5774   Register ShX, ShY;
5775   Register ShAmt, InvShAmt;
5776 
5777   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5778   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5779     // fshl: X << C | Y >> (BW - C)
5780     // fshr: X << (BW - C) | Y >> C
5781     // where C = Z % BW is not zero
5782     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5783     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5784     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5785     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5786     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5787   } else {
5788     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5789     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5790     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5791     if (isPowerOf2_32(BW)) {
5792       // Z % BW -> Z & (BW - 1)
5793       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5794       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5795       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5796       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5797     } else {
5798       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5799       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5800       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
5801     }
5802 
5803     auto One = MIRBuilder.buildConstant(ShTy, 1);
5804     if (IsFSHL) {
5805       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
5806       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
5807       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
5808     } else {
5809       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
5810       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
5811       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
5812     }
5813   }
5814 
5815   MIRBuilder.buildOr(Dst, ShX, ShY);
5816   MI.eraseFromParent();
5817   return Legalized;
5818 }
5819 
5820 LegalizerHelper::LegalizeResult
5821 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5822   // These operations approximately do the following (while avoiding undefined
5823   // shifts by BW):
5824   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5825   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5826   Register Dst = MI.getOperand(0).getReg();
5827   LLT Ty = MRI.getType(Dst);
5828   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5829 
5830   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5831   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5832 
5833   // TODO: Use smarter heuristic that accounts for vector legalization.
5834   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5835     return lowerFunnelShiftAsShifts(MI);
5836 
5837   // This only works for powers of 2, fallback to shifts if it fails.
5838   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5839   if (Result == UnableToLegalize)
5840     return lowerFunnelShiftAsShifts(MI);
5841   return Result;
5842 }
5843 
5844 LegalizerHelper::LegalizeResult
5845 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
5846   Register Dst = MI.getOperand(0).getReg();
5847   Register Src = MI.getOperand(1).getReg();
5848   Register Amt = MI.getOperand(2).getReg();
5849   LLT AmtTy = MRI.getType(Amt);
5850   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5851   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5852   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5853   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5854   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
5855   MI.eraseFromParent();
5856   return Legalized;
5857 }
5858 
5859 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
5860   Register Dst = MI.getOperand(0).getReg();
5861   Register Src = MI.getOperand(1).getReg();
5862   Register Amt = MI.getOperand(2).getReg();
5863   LLT DstTy = MRI.getType(Dst);
5864   LLT SrcTy = MRI.getType(Src);
5865   LLT AmtTy = MRI.getType(Amt);
5866 
5867   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
5868   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5869 
5870   MIRBuilder.setInstrAndDebugLoc(MI);
5871 
5872   // If a rotate in the other direction is supported, use it.
5873   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5874   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
5875       isPowerOf2_32(EltSizeInBits))
5876     return lowerRotateWithReverseRotate(MI);
5877 
5878   // If a funnel shift is supported, use it.
5879   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5880   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5881   bool IsFShLegal = false;
5882   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
5883       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
5884     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
5885                                 Register R3) {
5886       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
5887       MI.eraseFromParent();
5888       return Legalized;
5889     };
5890     // If a funnel shift in the other direction is supported, use it.
5891     if (IsFShLegal) {
5892       return buildFunnelShift(FShOpc, Dst, Src, Amt);
5893     } else if (isPowerOf2_32(EltSizeInBits)) {
5894       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
5895       return buildFunnelShift(RevFsh, Dst, Src, Amt);
5896     }
5897   }
5898 
5899   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5900   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
5901   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
5902   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
5903   Register ShVal;
5904   Register RevShiftVal;
5905   if (isPowerOf2_32(EltSizeInBits)) {
5906     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
5907     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
5908     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5909     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
5910     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5911     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
5912     RevShiftVal =
5913         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
5914   } else {
5915     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
5916     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
5917     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
5918     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
5919     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
5920     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
5921     auto One = MIRBuilder.buildConstant(AmtTy, 1);
5922     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
5923     RevShiftVal =
5924         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
5925   }
5926   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
5927   MI.eraseFromParent();
5928   return Legalized;
5929 }
5930 
5931 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
5932 // representation.
5933 LegalizerHelper::LegalizeResult
5934 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
5935   Register Dst = MI.getOperand(0).getReg();
5936   Register Src = MI.getOperand(1).getReg();
5937   const LLT S64 = LLT::scalar(64);
5938   const LLT S32 = LLT::scalar(32);
5939   const LLT S1 = LLT::scalar(1);
5940 
5941   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
5942 
5943   // unsigned cul2f(ulong u) {
5944   //   uint lz = clz(u);
5945   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
5946   //   u = (u << lz) & 0x7fffffffffffffffUL;
5947   //   ulong t = u & 0xffffffffffUL;
5948   //   uint v = (e << 23) | (uint)(u >> 40);
5949   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
5950   //   return as_float(v + r);
5951   // }
5952 
5953   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
5954   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
5955 
5956   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
5957 
5958   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
5959   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
5960 
5961   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
5962   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
5963 
5964   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
5965   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
5966 
5967   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
5968 
5969   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
5970   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
5971 
5972   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
5973   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
5974   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
5975 
5976   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
5977   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
5978   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
5979   auto One = MIRBuilder.buildConstant(S32, 1);
5980 
5981   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
5982   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
5983   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
5984   MIRBuilder.buildAdd(Dst, V, R);
5985 
5986   MI.eraseFromParent();
5987   return Legalized;
5988 }
5989 
5990 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
5991   Register Dst = MI.getOperand(0).getReg();
5992   Register Src = MI.getOperand(1).getReg();
5993   LLT DstTy = MRI.getType(Dst);
5994   LLT SrcTy = MRI.getType(Src);
5995 
5996   if (SrcTy == LLT::scalar(1)) {
5997     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
5998     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
5999     MIRBuilder.buildSelect(Dst, Src, True, False);
6000     MI.eraseFromParent();
6001     return Legalized;
6002   }
6003 
6004   if (SrcTy != LLT::scalar(64))
6005     return UnableToLegalize;
6006 
6007   if (DstTy == LLT::scalar(32)) {
6008     // TODO: SelectionDAG has several alternative expansions to port which may
6009     // be more reasonble depending on the available instructions. If a target
6010     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6011     // intermediate type, this is probably worse.
6012     return lowerU64ToF32BitOps(MI);
6013   }
6014 
6015   return UnableToLegalize;
6016 }
6017 
6018 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6019   Register Dst = MI.getOperand(0).getReg();
6020   Register Src = MI.getOperand(1).getReg();
6021   LLT DstTy = MRI.getType(Dst);
6022   LLT SrcTy = MRI.getType(Src);
6023 
6024   const LLT S64 = LLT::scalar(64);
6025   const LLT S32 = LLT::scalar(32);
6026   const LLT S1 = LLT::scalar(1);
6027 
6028   if (SrcTy == S1) {
6029     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6030     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6031     MIRBuilder.buildSelect(Dst, Src, True, False);
6032     MI.eraseFromParent();
6033     return Legalized;
6034   }
6035 
6036   if (SrcTy != S64)
6037     return UnableToLegalize;
6038 
6039   if (DstTy == S32) {
6040     // signed cl2f(long l) {
6041     //   long s = l >> 63;
6042     //   float r = cul2f((l + s) ^ s);
6043     //   return s ? -r : r;
6044     // }
6045     Register L = Src;
6046     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6047     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6048 
6049     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6050     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6051     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6052 
6053     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6054     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6055                                             MIRBuilder.buildConstant(S64, 0));
6056     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6057     MI.eraseFromParent();
6058     return Legalized;
6059   }
6060 
6061   return UnableToLegalize;
6062 }
6063 
6064 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6065   Register Dst = MI.getOperand(0).getReg();
6066   Register Src = MI.getOperand(1).getReg();
6067   LLT DstTy = MRI.getType(Dst);
6068   LLT SrcTy = MRI.getType(Src);
6069   const LLT S64 = LLT::scalar(64);
6070   const LLT S32 = LLT::scalar(32);
6071 
6072   if (SrcTy != S64 && SrcTy != S32)
6073     return UnableToLegalize;
6074   if (DstTy != S32 && DstTy != S64)
6075     return UnableToLegalize;
6076 
6077   // FPTOSI gives same result as FPTOUI for positive signed integers.
6078   // FPTOUI needs to deal with fp values that convert to unsigned integers
6079   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6080 
6081   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6082   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6083                                                 : APFloat::IEEEdouble(),
6084                     APInt::getZero(SrcTy.getSizeInBits()));
6085   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6086 
6087   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6088 
6089   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6090   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6091   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6092   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6093   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6094   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6095   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6096 
6097   const LLT S1 = LLT::scalar(1);
6098 
6099   MachineInstrBuilder FCMP =
6100       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6101   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6102 
6103   MI.eraseFromParent();
6104   return Legalized;
6105 }
6106 
6107 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6108   Register Dst = MI.getOperand(0).getReg();
6109   Register Src = MI.getOperand(1).getReg();
6110   LLT DstTy = MRI.getType(Dst);
6111   LLT SrcTy = MRI.getType(Src);
6112   const LLT S64 = LLT::scalar(64);
6113   const LLT S32 = LLT::scalar(32);
6114 
6115   // FIXME: Only f32 to i64 conversions are supported.
6116   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6117     return UnableToLegalize;
6118 
6119   // Expand f32 -> i64 conversion
6120   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6121   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6122 
6123   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6124 
6125   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6126   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6127 
6128   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6129   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6130 
6131   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6132                                            APInt::getSignMask(SrcEltBits));
6133   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6134   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6135   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6136   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6137 
6138   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6139   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6140   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6141 
6142   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6143   R = MIRBuilder.buildZExt(DstTy, R);
6144 
6145   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6146   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6147   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6148   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6149 
6150   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6151   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6152 
6153   const LLT S1 = LLT::scalar(1);
6154   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6155                                     S1, Exponent, ExponentLoBit);
6156 
6157   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6158 
6159   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6160   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6161 
6162   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6163 
6164   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6165                                           S1, Exponent, ZeroSrcTy);
6166 
6167   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6168   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6169 
6170   MI.eraseFromParent();
6171   return Legalized;
6172 }
6173 
6174 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6175 LegalizerHelper::LegalizeResult
6176 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6177   Register Dst = MI.getOperand(0).getReg();
6178   Register Src = MI.getOperand(1).getReg();
6179 
6180   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6181     return UnableToLegalize;
6182 
6183   const unsigned ExpMask = 0x7ff;
6184   const unsigned ExpBiasf64 = 1023;
6185   const unsigned ExpBiasf16 = 15;
6186   const LLT S32 = LLT::scalar(32);
6187   const LLT S1 = LLT::scalar(1);
6188 
6189   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6190   Register U = Unmerge.getReg(0);
6191   Register UH = Unmerge.getReg(1);
6192 
6193   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6194   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6195 
6196   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6197   // add the f16 bias (15) to get the biased exponent for the f16 format.
6198   E = MIRBuilder.buildAdd(
6199     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6200 
6201   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6202   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6203 
6204   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6205                                        MIRBuilder.buildConstant(S32, 0x1ff));
6206   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6207 
6208   auto Zero = MIRBuilder.buildConstant(S32, 0);
6209   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6210   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6211   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6212 
6213   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6214   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6215   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6216   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6217 
6218   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6219   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6220 
6221   // N = M | (E << 12);
6222   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6223   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6224 
6225   // B = clamp(1-E, 0, 13);
6226   auto One = MIRBuilder.buildConstant(S32, 1);
6227   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6228   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6229   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6230 
6231   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6232                                        MIRBuilder.buildConstant(S32, 0x1000));
6233 
6234   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6235   auto D0 = MIRBuilder.buildShl(S32, D, B);
6236 
6237   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6238                                              D0, SigSetHigh);
6239   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6240   D = MIRBuilder.buildOr(S32, D, D1);
6241 
6242   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6243   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6244 
6245   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6246   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6247 
6248   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6249                                        MIRBuilder.buildConstant(S32, 3));
6250   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6251 
6252   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6253                                        MIRBuilder.buildConstant(S32, 5));
6254   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6255 
6256   V1 = MIRBuilder.buildOr(S32, V0, V1);
6257   V = MIRBuilder.buildAdd(S32, V, V1);
6258 
6259   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6260                                        E, MIRBuilder.buildConstant(S32, 30));
6261   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6262                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6263 
6264   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6265                                          E, MIRBuilder.buildConstant(S32, 1039));
6266   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6267 
6268   // Extract the sign bit.
6269   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6270   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6271 
6272   // Insert the sign bit
6273   V = MIRBuilder.buildOr(S32, Sign, V);
6274 
6275   MIRBuilder.buildTrunc(Dst, V);
6276   MI.eraseFromParent();
6277   return Legalized;
6278 }
6279 
6280 LegalizerHelper::LegalizeResult
6281 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6282   Register Dst = MI.getOperand(0).getReg();
6283   Register Src = MI.getOperand(1).getReg();
6284 
6285   LLT DstTy = MRI.getType(Dst);
6286   LLT SrcTy = MRI.getType(Src);
6287   const LLT S64 = LLT::scalar(64);
6288   const LLT S16 = LLT::scalar(16);
6289 
6290   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6291     return lowerFPTRUNC_F64_TO_F16(MI);
6292 
6293   return UnableToLegalize;
6294 }
6295 
6296 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6297 // multiplication tree.
6298 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6299   Register Dst = MI.getOperand(0).getReg();
6300   Register Src0 = MI.getOperand(1).getReg();
6301   Register Src1 = MI.getOperand(2).getReg();
6302   LLT Ty = MRI.getType(Dst);
6303 
6304   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6305   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6306   MI.eraseFromParent();
6307   return Legalized;
6308 }
6309 
6310 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6311   switch (Opc) {
6312   case TargetOpcode::G_SMIN:
6313     return CmpInst::ICMP_SLT;
6314   case TargetOpcode::G_SMAX:
6315     return CmpInst::ICMP_SGT;
6316   case TargetOpcode::G_UMIN:
6317     return CmpInst::ICMP_ULT;
6318   case TargetOpcode::G_UMAX:
6319     return CmpInst::ICMP_UGT;
6320   default:
6321     llvm_unreachable("not in integer min/max");
6322   }
6323 }
6324 
6325 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6326   Register Dst = MI.getOperand(0).getReg();
6327   Register Src0 = MI.getOperand(1).getReg();
6328   Register Src1 = MI.getOperand(2).getReg();
6329 
6330   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6331   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6332 
6333   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6334   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6335 
6336   MI.eraseFromParent();
6337   return Legalized;
6338 }
6339 
6340 LegalizerHelper::LegalizeResult
6341 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6342   Register Dst = MI.getOperand(0).getReg();
6343   Register Src0 = MI.getOperand(1).getReg();
6344   Register Src1 = MI.getOperand(2).getReg();
6345 
6346   const LLT Src0Ty = MRI.getType(Src0);
6347   const LLT Src1Ty = MRI.getType(Src1);
6348 
6349   const int Src0Size = Src0Ty.getScalarSizeInBits();
6350   const int Src1Size = Src1Ty.getScalarSizeInBits();
6351 
6352   auto SignBitMask = MIRBuilder.buildConstant(
6353     Src0Ty, APInt::getSignMask(Src0Size));
6354 
6355   auto NotSignBitMask = MIRBuilder.buildConstant(
6356     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6357 
6358   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6359   Register And1;
6360   if (Src0Ty == Src1Ty) {
6361     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6362   } else if (Src0Size > Src1Size) {
6363     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6364     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6365     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6366     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6367   } else {
6368     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6369     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6370     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6371     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6372   }
6373 
6374   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6375   // constants are a nan and -0.0, but the final result should preserve
6376   // everything.
6377   unsigned Flags = MI.getFlags();
6378   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6379 
6380   MI.eraseFromParent();
6381   return Legalized;
6382 }
6383 
6384 LegalizerHelper::LegalizeResult
6385 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6386   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6387     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6388 
6389   Register Dst = MI.getOperand(0).getReg();
6390   Register Src0 = MI.getOperand(1).getReg();
6391   Register Src1 = MI.getOperand(2).getReg();
6392   LLT Ty = MRI.getType(Dst);
6393 
6394   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6395     // Insert canonicalizes if it's possible we need to quiet to get correct
6396     // sNaN behavior.
6397 
6398     // Note this must be done here, and not as an optimization combine in the
6399     // absence of a dedicate quiet-snan instruction as we're using an
6400     // omni-purpose G_FCANONICALIZE.
6401     if (!isKnownNeverSNaN(Src0, MRI))
6402       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6403 
6404     if (!isKnownNeverSNaN(Src1, MRI))
6405       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6406   }
6407 
6408   // If there are no nans, it's safe to simply replace this with the non-IEEE
6409   // version.
6410   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6411   MI.eraseFromParent();
6412   return Legalized;
6413 }
6414 
6415 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6416   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6417   Register DstReg = MI.getOperand(0).getReg();
6418   LLT Ty = MRI.getType(DstReg);
6419   unsigned Flags = MI.getFlags();
6420 
6421   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6422                                   Flags);
6423   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6424   MI.eraseFromParent();
6425   return Legalized;
6426 }
6427 
6428 LegalizerHelper::LegalizeResult
6429 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6430   Register DstReg = MI.getOperand(0).getReg();
6431   Register X = MI.getOperand(1).getReg();
6432   const unsigned Flags = MI.getFlags();
6433   const LLT Ty = MRI.getType(DstReg);
6434   const LLT CondTy = Ty.changeElementSize(1);
6435 
6436   // round(x) =>
6437   //  t = trunc(x);
6438   //  d = fabs(x - t);
6439   //  o = copysign(1.0f, x);
6440   //  return t + (d >= 0.5 ? o : 0.0);
6441 
6442   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6443 
6444   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6445   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6446   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6447   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6448   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6449   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6450 
6451   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6452                                   Flags);
6453   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6454 
6455   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6456 
6457   MI.eraseFromParent();
6458   return Legalized;
6459 }
6460 
6461 LegalizerHelper::LegalizeResult
6462 LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6463   Register DstReg = MI.getOperand(0).getReg();
6464   Register SrcReg = MI.getOperand(1).getReg();
6465   unsigned Flags = MI.getFlags();
6466   LLT Ty = MRI.getType(DstReg);
6467   const LLT CondTy = Ty.changeElementSize(1);
6468 
6469   // result = trunc(src);
6470   // if (src < 0.0 && src != result)
6471   //   result += -1.0.
6472 
6473   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6474   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6475 
6476   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6477                                   SrcReg, Zero, Flags);
6478   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6479                                       SrcReg, Trunc, Flags);
6480   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6481   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6482 
6483   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6484   MI.eraseFromParent();
6485   return Legalized;
6486 }
6487 
6488 LegalizerHelper::LegalizeResult
6489 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6490   const unsigned NumOps = MI.getNumOperands();
6491   Register DstReg = MI.getOperand(0).getReg();
6492   Register Src0Reg = MI.getOperand(1).getReg();
6493   LLT DstTy = MRI.getType(DstReg);
6494   LLT SrcTy = MRI.getType(Src0Reg);
6495   unsigned PartSize = SrcTy.getSizeInBits();
6496 
6497   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6498   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6499 
6500   for (unsigned I = 2; I != NumOps; ++I) {
6501     const unsigned Offset = (I - 1) * PartSize;
6502 
6503     Register SrcReg = MI.getOperand(I).getReg();
6504     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6505 
6506     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6507       MRI.createGenericVirtualRegister(WideTy);
6508 
6509     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6510     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6511     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6512     ResultReg = NextResult;
6513   }
6514 
6515   if (DstTy.isPointer()) {
6516     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6517           DstTy.getAddressSpace())) {
6518       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6519       return UnableToLegalize;
6520     }
6521 
6522     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6523   }
6524 
6525   MI.eraseFromParent();
6526   return Legalized;
6527 }
6528 
6529 LegalizerHelper::LegalizeResult
6530 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6531   const unsigned NumDst = MI.getNumOperands() - 1;
6532   Register SrcReg = MI.getOperand(NumDst).getReg();
6533   Register Dst0Reg = MI.getOperand(0).getReg();
6534   LLT DstTy = MRI.getType(Dst0Reg);
6535   if (DstTy.isPointer())
6536     return UnableToLegalize; // TODO
6537 
6538   SrcReg = coerceToScalar(SrcReg);
6539   if (!SrcReg)
6540     return UnableToLegalize;
6541 
6542   // Expand scalarizing unmerge as bitcast to integer and shift.
6543   LLT IntTy = MRI.getType(SrcReg);
6544 
6545   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6546 
6547   const unsigned DstSize = DstTy.getSizeInBits();
6548   unsigned Offset = DstSize;
6549   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6550     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6551     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6552     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6553   }
6554 
6555   MI.eraseFromParent();
6556   return Legalized;
6557 }
6558 
6559 /// Lower a vector extract or insert by writing the vector to a stack temporary
6560 /// and reloading the element or vector.
6561 ///
6562 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6563 ///  =>
6564 ///  %stack_temp = G_FRAME_INDEX
6565 ///  G_STORE %vec, %stack_temp
6566 ///  %idx = clamp(%idx, %vec.getNumElements())
6567 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6568 ///  %dst = G_LOAD %element_ptr
6569 LegalizerHelper::LegalizeResult
6570 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6571   Register DstReg = MI.getOperand(0).getReg();
6572   Register SrcVec = MI.getOperand(1).getReg();
6573   Register InsertVal;
6574   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6575     InsertVal = MI.getOperand(2).getReg();
6576 
6577   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6578 
6579   LLT VecTy = MRI.getType(SrcVec);
6580   LLT EltTy = VecTy.getElementType();
6581   unsigned NumElts = VecTy.getNumElements();
6582 
6583   int64_t IdxVal;
6584   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
6585     SmallVector<Register, 8> SrcRegs;
6586     extractParts(SrcVec, EltTy, NumElts, SrcRegs);
6587 
6588     if (InsertVal) {
6589       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
6590       MIRBuilder.buildMerge(DstReg, SrcRegs);
6591     } else {
6592       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
6593     }
6594 
6595     MI.eraseFromParent();
6596     return Legalized;
6597   }
6598 
6599   if (!EltTy.isByteSized()) { // Not implemented.
6600     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6601     return UnableToLegalize;
6602   }
6603 
6604   unsigned EltBytes = EltTy.getSizeInBytes();
6605   Align VecAlign = getStackTemporaryAlignment(VecTy);
6606   Align EltAlign;
6607 
6608   MachinePointerInfo PtrInfo;
6609   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6610                                         VecAlign, PtrInfo);
6611   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6612 
6613   // Get the pointer to the element, and be sure not to hit undefined behavior
6614   // if the index is out of bounds.
6615   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6616 
6617   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6618     int64_t Offset = IdxVal * EltBytes;
6619     PtrInfo = PtrInfo.getWithOffset(Offset);
6620     EltAlign = commonAlignment(VecAlign, Offset);
6621   } else {
6622     // We lose information with a variable offset.
6623     EltAlign = getStackTemporaryAlignment(EltTy);
6624     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6625   }
6626 
6627   if (InsertVal) {
6628     // Write the inserted element
6629     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6630 
6631     // Reload the whole vector.
6632     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6633   } else {
6634     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6635   }
6636 
6637   MI.eraseFromParent();
6638   return Legalized;
6639 }
6640 
6641 LegalizerHelper::LegalizeResult
6642 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6643   Register DstReg = MI.getOperand(0).getReg();
6644   Register Src0Reg = MI.getOperand(1).getReg();
6645   Register Src1Reg = MI.getOperand(2).getReg();
6646   LLT Src0Ty = MRI.getType(Src0Reg);
6647   LLT DstTy = MRI.getType(DstReg);
6648   LLT IdxTy = LLT::scalar(32);
6649 
6650   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6651 
6652   if (DstTy.isScalar()) {
6653     if (Src0Ty.isVector())
6654       return UnableToLegalize;
6655 
6656     // This is just a SELECT.
6657     assert(Mask.size() == 1 && "Expected a single mask element");
6658     Register Val;
6659     if (Mask[0] < 0 || Mask[0] > 1)
6660       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6661     else
6662       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6663     MIRBuilder.buildCopy(DstReg, Val);
6664     MI.eraseFromParent();
6665     return Legalized;
6666   }
6667 
6668   Register Undef;
6669   SmallVector<Register, 32> BuildVec;
6670   LLT EltTy = DstTy.getElementType();
6671 
6672   for (int Idx : Mask) {
6673     if (Idx < 0) {
6674       if (!Undef.isValid())
6675         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6676       BuildVec.push_back(Undef);
6677       continue;
6678     }
6679 
6680     if (Src0Ty.isScalar()) {
6681       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6682     } else {
6683       int NumElts = Src0Ty.getNumElements();
6684       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6685       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6686       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6687       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6688       BuildVec.push_back(Extract.getReg(0));
6689     }
6690   }
6691 
6692   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6693   MI.eraseFromParent();
6694   return Legalized;
6695 }
6696 
6697 LegalizerHelper::LegalizeResult
6698 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6699   const auto &MF = *MI.getMF();
6700   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6701   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6702     return UnableToLegalize;
6703 
6704   Register Dst = MI.getOperand(0).getReg();
6705   Register AllocSize = MI.getOperand(1).getReg();
6706   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6707 
6708   LLT PtrTy = MRI.getType(Dst);
6709   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6710 
6711   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6712   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6713   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6714 
6715   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6716   // have to generate an extra instruction to negate the alloc and then use
6717   // G_PTR_ADD to add the negative offset.
6718   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6719   if (Alignment > Align(1)) {
6720     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6721     AlignMask.negate();
6722     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6723     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6724   }
6725 
6726   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6727   MIRBuilder.buildCopy(SPReg, SPTmp);
6728   MIRBuilder.buildCopy(Dst, SPTmp);
6729 
6730   MI.eraseFromParent();
6731   return Legalized;
6732 }
6733 
6734 LegalizerHelper::LegalizeResult
6735 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6736   Register Dst = MI.getOperand(0).getReg();
6737   Register Src = MI.getOperand(1).getReg();
6738   unsigned Offset = MI.getOperand(2).getImm();
6739 
6740   LLT DstTy = MRI.getType(Dst);
6741   LLT SrcTy = MRI.getType(Src);
6742 
6743   // Extract sub-vector or one element
6744   if (SrcTy.isVector()) {
6745     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
6746     unsigned DstSize = DstTy.getSizeInBits();
6747 
6748     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
6749         (Offset + DstSize <= SrcTy.getSizeInBits())) {
6750       // Unmerge and allow access to each Src element for the artifact combiner.
6751       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src);
6752 
6753       // Take element(s) we need to extract and copy it (merge them).
6754       SmallVector<Register, 8> SubVectorElts;
6755       for (unsigned Idx = Offset / SrcEltSize;
6756            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
6757         SubVectorElts.push_back(Unmerge.getReg(Idx));
6758       }
6759       if (SubVectorElts.size() == 1)
6760         MIRBuilder.buildCopy(Dst, SubVectorElts[0]);
6761       else
6762         MIRBuilder.buildMerge(Dst, SubVectorElts);
6763 
6764       MI.eraseFromParent();
6765       return Legalized;
6766     }
6767   }
6768 
6769   if (DstTy.isScalar() &&
6770       (SrcTy.isScalar() ||
6771        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6772     LLT SrcIntTy = SrcTy;
6773     if (!SrcTy.isScalar()) {
6774       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6775       Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
6776     }
6777 
6778     if (Offset == 0)
6779       MIRBuilder.buildTrunc(Dst, Src);
6780     else {
6781       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6782       auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
6783       MIRBuilder.buildTrunc(Dst, Shr);
6784     }
6785 
6786     MI.eraseFromParent();
6787     return Legalized;
6788   }
6789 
6790   return UnableToLegalize;
6791 }
6792 
6793 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6794   Register Dst = MI.getOperand(0).getReg();
6795   Register Src = MI.getOperand(1).getReg();
6796   Register InsertSrc = MI.getOperand(2).getReg();
6797   uint64_t Offset = MI.getOperand(3).getImm();
6798 
6799   LLT DstTy = MRI.getType(Src);
6800   LLT InsertTy = MRI.getType(InsertSrc);
6801 
6802   // Insert sub-vector or one element
6803   if (DstTy.isVector() && !InsertTy.isPointer()) {
6804     LLT EltTy = DstTy.getElementType();
6805     unsigned EltSize = EltTy.getSizeInBits();
6806     unsigned InsertSize = InsertTy.getSizeInBits();
6807 
6808     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
6809         (Offset + InsertSize <= DstTy.getSizeInBits())) {
6810       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
6811       SmallVector<Register, 8> DstElts;
6812       unsigned Idx = 0;
6813       // Elements from Src before insert start Offset
6814       for (; Idx < Offset / EltSize; ++Idx) {
6815         DstElts.push_back(UnmergeSrc.getReg(Idx));
6816       }
6817 
6818       // Replace elements in Src with elements from InsertSrc
6819       if (InsertTy.getSizeInBits() > EltSize) {
6820         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
6821         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
6822              ++Idx, ++i) {
6823           DstElts.push_back(UnmergeInsertSrc.getReg(i));
6824         }
6825       } else {
6826         DstElts.push_back(InsertSrc);
6827         ++Idx;
6828       }
6829 
6830       // Remaining elements from Src after insert
6831       for (; Idx < DstTy.getNumElements(); ++Idx) {
6832         DstElts.push_back(UnmergeSrc.getReg(Idx));
6833       }
6834 
6835       MIRBuilder.buildMerge(Dst, DstElts);
6836       MI.eraseFromParent();
6837       return Legalized;
6838     }
6839   }
6840 
6841   if (InsertTy.isVector() ||
6842       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6843     return UnableToLegalize;
6844 
6845   const DataLayout &DL = MIRBuilder.getDataLayout();
6846   if ((DstTy.isPointer() &&
6847        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6848       (InsertTy.isPointer() &&
6849        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6850     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6851     return UnableToLegalize;
6852   }
6853 
6854   LLT IntDstTy = DstTy;
6855 
6856   if (!DstTy.isScalar()) {
6857     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6858     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6859   }
6860 
6861   if (!InsertTy.isScalar()) {
6862     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6863     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6864   }
6865 
6866   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6867   if (Offset != 0) {
6868     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6869     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6870   }
6871 
6872   APInt MaskVal = APInt::getBitsSetWithWrap(
6873       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6874 
6875   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6876   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6877   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6878 
6879   MIRBuilder.buildCast(Dst, Or);
6880   MI.eraseFromParent();
6881   return Legalized;
6882 }
6883 
6884 LegalizerHelper::LegalizeResult
6885 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6886   Register Dst0 = MI.getOperand(0).getReg();
6887   Register Dst1 = MI.getOperand(1).getReg();
6888   Register LHS = MI.getOperand(2).getReg();
6889   Register RHS = MI.getOperand(3).getReg();
6890   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6891 
6892   LLT Ty = MRI.getType(Dst0);
6893   LLT BoolTy = MRI.getType(Dst1);
6894 
6895   if (IsAdd)
6896     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6897   else
6898     MIRBuilder.buildSub(Dst0, LHS, RHS);
6899 
6900   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6901 
6902   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6903 
6904   // For an addition, the result should be less than one of the operands (LHS)
6905   // if and only if the other operand (RHS) is negative, otherwise there will
6906   // be overflow.
6907   // For a subtraction, the result should be less than one of the operands
6908   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6909   // otherwise there will be overflow.
6910   auto ResultLowerThanLHS =
6911       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6912   auto ConditionRHS = MIRBuilder.buildICmp(
6913       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6914 
6915   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6916   MI.eraseFromParent();
6917   return Legalized;
6918 }
6919 
6920 LegalizerHelper::LegalizeResult
6921 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
6922   Register Res = MI.getOperand(0).getReg();
6923   Register LHS = MI.getOperand(1).getReg();
6924   Register RHS = MI.getOperand(2).getReg();
6925   LLT Ty = MRI.getType(Res);
6926   bool IsSigned;
6927   bool IsAdd;
6928   unsigned BaseOp;
6929   switch (MI.getOpcode()) {
6930   default:
6931     llvm_unreachable("unexpected addsat/subsat opcode");
6932   case TargetOpcode::G_UADDSAT:
6933     IsSigned = false;
6934     IsAdd = true;
6935     BaseOp = TargetOpcode::G_ADD;
6936     break;
6937   case TargetOpcode::G_SADDSAT:
6938     IsSigned = true;
6939     IsAdd = true;
6940     BaseOp = TargetOpcode::G_ADD;
6941     break;
6942   case TargetOpcode::G_USUBSAT:
6943     IsSigned = false;
6944     IsAdd = false;
6945     BaseOp = TargetOpcode::G_SUB;
6946     break;
6947   case TargetOpcode::G_SSUBSAT:
6948     IsSigned = true;
6949     IsAdd = false;
6950     BaseOp = TargetOpcode::G_SUB;
6951     break;
6952   }
6953 
6954   if (IsSigned) {
6955     // sadd.sat(a, b) ->
6956     //   hi = 0x7fffffff - smax(a, 0)
6957     //   lo = 0x80000000 - smin(a, 0)
6958     //   a + smin(smax(lo, b), hi)
6959     // ssub.sat(a, b) ->
6960     //   lo = smax(a, -1) - 0x7fffffff
6961     //   hi = smin(a, -1) - 0x80000000
6962     //   a - smin(smax(lo, b), hi)
6963     // TODO: AMDGPU can use a "median of 3" instruction here:
6964     //   a +/- med3(lo, b, hi)
6965     uint64_t NumBits = Ty.getScalarSizeInBits();
6966     auto MaxVal =
6967         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
6968     auto MinVal =
6969         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
6970     MachineInstrBuilder Hi, Lo;
6971     if (IsAdd) {
6972       auto Zero = MIRBuilder.buildConstant(Ty, 0);
6973       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
6974       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
6975     } else {
6976       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
6977       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
6978                                MaxVal);
6979       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
6980                                MinVal);
6981     }
6982     auto RHSClamped =
6983         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
6984     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
6985   } else {
6986     // uadd.sat(a, b) -> a + umin(~a, b)
6987     // usub.sat(a, b) -> a - umin(a, b)
6988     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
6989     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
6990     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
6991   }
6992 
6993   MI.eraseFromParent();
6994   return Legalized;
6995 }
6996 
6997 LegalizerHelper::LegalizeResult
6998 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
6999   Register Res = MI.getOperand(0).getReg();
7000   Register LHS = MI.getOperand(1).getReg();
7001   Register RHS = MI.getOperand(2).getReg();
7002   LLT Ty = MRI.getType(Res);
7003   LLT BoolTy = Ty.changeElementSize(1);
7004   bool IsSigned;
7005   bool IsAdd;
7006   unsigned OverflowOp;
7007   switch (MI.getOpcode()) {
7008   default:
7009     llvm_unreachable("unexpected addsat/subsat opcode");
7010   case TargetOpcode::G_UADDSAT:
7011     IsSigned = false;
7012     IsAdd = true;
7013     OverflowOp = TargetOpcode::G_UADDO;
7014     break;
7015   case TargetOpcode::G_SADDSAT:
7016     IsSigned = true;
7017     IsAdd = true;
7018     OverflowOp = TargetOpcode::G_SADDO;
7019     break;
7020   case TargetOpcode::G_USUBSAT:
7021     IsSigned = false;
7022     IsAdd = false;
7023     OverflowOp = TargetOpcode::G_USUBO;
7024     break;
7025   case TargetOpcode::G_SSUBSAT:
7026     IsSigned = true;
7027     IsAdd = false;
7028     OverflowOp = TargetOpcode::G_SSUBO;
7029     break;
7030   }
7031 
7032   auto OverflowRes =
7033       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7034   Register Tmp = OverflowRes.getReg(0);
7035   Register Ov = OverflowRes.getReg(1);
7036   MachineInstrBuilder Clamp;
7037   if (IsSigned) {
7038     // sadd.sat(a, b) ->
7039     //   {tmp, ov} = saddo(a, b)
7040     //   ov ? (tmp >>s 31) + 0x80000000 : r
7041     // ssub.sat(a, b) ->
7042     //   {tmp, ov} = ssubo(a, b)
7043     //   ov ? (tmp >>s 31) + 0x80000000 : r
7044     uint64_t NumBits = Ty.getScalarSizeInBits();
7045     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7046     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7047     auto MinVal =
7048         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7049     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7050   } else {
7051     // uadd.sat(a, b) ->
7052     //   {tmp, ov} = uaddo(a, b)
7053     //   ov ? 0xffffffff : tmp
7054     // usub.sat(a, b) ->
7055     //   {tmp, ov} = usubo(a, b)
7056     //   ov ? 0 : tmp
7057     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7058   }
7059   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7060 
7061   MI.eraseFromParent();
7062   return Legalized;
7063 }
7064 
7065 LegalizerHelper::LegalizeResult
7066 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7067   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7068           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7069          "Expected shlsat opcode!");
7070   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7071   Register Res = MI.getOperand(0).getReg();
7072   Register LHS = MI.getOperand(1).getReg();
7073   Register RHS = MI.getOperand(2).getReg();
7074   LLT Ty = MRI.getType(Res);
7075   LLT BoolTy = Ty.changeElementSize(1);
7076 
7077   unsigned BW = Ty.getScalarSizeInBits();
7078   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7079   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7080                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7081 
7082   MachineInstrBuilder SatVal;
7083   if (IsSigned) {
7084     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7085     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7086     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7087                                     MIRBuilder.buildConstant(Ty, 0));
7088     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7089   } else {
7090     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7091   }
7092   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7093   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7094 
7095   MI.eraseFromParent();
7096   return Legalized;
7097 }
7098 
7099 LegalizerHelper::LegalizeResult
7100 LegalizerHelper::lowerBswap(MachineInstr &MI) {
7101   Register Dst = MI.getOperand(0).getReg();
7102   Register Src = MI.getOperand(1).getReg();
7103   const LLT Ty = MRI.getType(Src);
7104   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7105   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7106 
7107   // Swap most and least significant byte, set remaining bytes in Res to zero.
7108   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7109   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7110   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7111   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7112 
7113   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7114   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7115     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7116     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7117     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7118     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7119     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7120     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7121     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7122     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7123     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7124     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7125     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7126     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7127   }
7128   Res.getInstr()->getOperand(0).setReg(Dst);
7129 
7130   MI.eraseFromParent();
7131   return Legalized;
7132 }
7133 
7134 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7135 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7136                                  MachineInstrBuilder Src, APInt Mask) {
7137   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7138   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7139   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7140   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7141   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7142   return B.buildOr(Dst, LHS, RHS);
7143 }
7144 
7145 LegalizerHelper::LegalizeResult
7146 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7147   Register Dst = MI.getOperand(0).getReg();
7148   Register Src = MI.getOperand(1).getReg();
7149   const LLT Ty = MRI.getType(Src);
7150   unsigned Size = Ty.getSizeInBits();
7151 
7152   MachineInstrBuilder BSWAP =
7153       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7154 
7155   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7156   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7157   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7158   MachineInstrBuilder Swap4 =
7159       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7160 
7161   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7162   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7163   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7164   MachineInstrBuilder Swap2 =
7165       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7166 
7167   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7168   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7169   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7170   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7171 
7172   MI.eraseFromParent();
7173   return Legalized;
7174 }
7175 
7176 LegalizerHelper::LegalizeResult
7177 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7178   MachineFunction &MF = MIRBuilder.getMF();
7179 
7180   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7181   int NameOpIdx = IsRead ? 1 : 0;
7182   int ValRegIndex = IsRead ? 0 : 1;
7183 
7184   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7185   const LLT Ty = MRI.getType(ValReg);
7186   const MDString *RegStr = cast<MDString>(
7187     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7188 
7189   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7190   if (!PhysReg.isValid())
7191     return UnableToLegalize;
7192 
7193   if (IsRead)
7194     MIRBuilder.buildCopy(ValReg, PhysReg);
7195   else
7196     MIRBuilder.buildCopy(PhysReg, ValReg);
7197 
7198   MI.eraseFromParent();
7199   return Legalized;
7200 }
7201 
7202 LegalizerHelper::LegalizeResult
7203 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7204   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7205   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7206   Register Result = MI.getOperand(0).getReg();
7207   LLT OrigTy = MRI.getType(Result);
7208   auto SizeInBits = OrigTy.getScalarSizeInBits();
7209   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7210 
7211   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7212   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7213   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7214   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7215 
7216   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7217   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7218   MIRBuilder.buildTrunc(Result, Shifted);
7219 
7220   MI.eraseFromParent();
7221   return Legalized;
7222 }
7223 
7224 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7225   // Implement vector G_SELECT in terms of XOR, AND, OR.
7226   Register DstReg = MI.getOperand(0).getReg();
7227   Register MaskReg = MI.getOperand(1).getReg();
7228   Register Op1Reg = MI.getOperand(2).getReg();
7229   Register Op2Reg = MI.getOperand(3).getReg();
7230   LLT DstTy = MRI.getType(DstReg);
7231   LLT MaskTy = MRI.getType(MaskReg);
7232   LLT Op1Ty = MRI.getType(Op1Reg);
7233   if (!DstTy.isVector())
7234     return UnableToLegalize;
7235 
7236   // Vector selects can have a scalar predicate. If so, splat into a vector and
7237   // finish for later legalization attempts to try again.
7238   if (MaskTy.isScalar()) {
7239     Register MaskElt = MaskReg;
7240     if (MaskTy.getSizeInBits() < DstTy.getScalarSizeInBits())
7241       MaskElt = MIRBuilder.buildSExt(DstTy.getElementType(), MaskElt).getReg(0);
7242     // Generate a vector splat idiom to be pattern matched later.
7243     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7244     Observer.changingInstr(MI);
7245     MI.getOperand(1).setReg(ShufSplat.getReg(0));
7246     Observer.changedInstr(MI);
7247     return Legalized;
7248   }
7249 
7250   if (MaskTy.getSizeInBits() != Op1Ty.getSizeInBits()) {
7251     return UnableToLegalize;
7252   }
7253 
7254   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7255   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7256   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7257   MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7258   MI.eraseFromParent();
7259   return Legalized;
7260 }
7261 
7262 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7263   // Split DIVREM into individual instructions.
7264   unsigned Opcode = MI.getOpcode();
7265 
7266   MIRBuilder.buildInstr(
7267       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7268                                         : TargetOpcode::G_UDIV,
7269       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7270   MIRBuilder.buildInstr(
7271       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7272                                         : TargetOpcode::G_UREM,
7273       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7274   MI.eraseFromParent();
7275   return Legalized;
7276 }
7277 
7278 LegalizerHelper::LegalizeResult
7279 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7280   // Expand %res = G_ABS %a into:
7281   // %v1 = G_ASHR %a, scalar_size-1
7282   // %v2 = G_ADD %a, %v1
7283   // %res = G_XOR %v2, %v1
7284   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7285   Register OpReg = MI.getOperand(1).getReg();
7286   auto ShiftAmt =
7287       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7288   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7289   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7290   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7291   MI.eraseFromParent();
7292   return Legalized;
7293 }
7294 
7295 LegalizerHelper::LegalizeResult
7296 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7297   // Expand %res = G_ABS %a into:
7298   // %v1 = G_CONSTANT 0
7299   // %v2 = G_SUB %v1, %a
7300   // %res = G_SMAX %a, %v2
7301   Register SrcReg = MI.getOperand(1).getReg();
7302   LLT Ty = MRI.getType(SrcReg);
7303   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7304   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7305   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7306   MI.eraseFromParent();
7307   return Legalized;
7308 }
7309 
7310 LegalizerHelper::LegalizeResult
7311 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7312   Register SrcReg = MI.getOperand(1).getReg();
7313   LLT SrcTy = MRI.getType(SrcReg);
7314   LLT DstTy = MRI.getType(SrcReg);
7315 
7316   // The source could be a scalar if the IR type was <1 x sN>.
7317   if (SrcTy.isScalar()) {
7318     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7319       return UnableToLegalize; // FIXME: handle extension.
7320     // This can be just a plain copy.
7321     Observer.changingInstr(MI);
7322     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7323     Observer.changedInstr(MI);
7324     return Legalized;
7325   }
7326   return UnableToLegalize;;
7327 }
7328 
7329 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7330   // On Darwin, -Os means optimize for size without hurting performance, so
7331   // only really optimize for size when -Oz (MinSize) is used.
7332   if (MF.getTarget().getTargetTriple().isOSDarwin())
7333     return MF.getFunction().hasMinSize();
7334   return MF.getFunction().hasOptSize();
7335 }
7336 
7337 // Returns a list of types to use for memory op lowering in MemOps. A partial
7338 // port of findOptimalMemOpLowering in TargetLowering.
7339 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7340                                           unsigned Limit, const MemOp &Op,
7341                                           unsigned DstAS, unsigned SrcAS,
7342                                           const AttributeList &FuncAttributes,
7343                                           const TargetLowering &TLI) {
7344   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7345     return false;
7346 
7347   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7348 
7349   if (Ty == LLT()) {
7350     // Use the largest scalar type whose alignment constraints are satisfied.
7351     // We only need to check DstAlign here as SrcAlign is always greater or
7352     // equal to DstAlign (or zero).
7353     Ty = LLT::scalar(64);
7354     if (Op.isFixedDstAlign())
7355       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7356              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7357         Ty = LLT::scalar(Ty.getSizeInBytes());
7358     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7359     // FIXME: check for the largest legal type we can load/store to.
7360   }
7361 
7362   unsigned NumMemOps = 0;
7363   uint64_t Size = Op.size();
7364   while (Size) {
7365     unsigned TySize = Ty.getSizeInBytes();
7366     while (TySize > Size) {
7367       // For now, only use non-vector load / store's for the left-over pieces.
7368       LLT NewTy = Ty;
7369       // FIXME: check for mem op safety and legality of the types. Not all of
7370       // SDAGisms map cleanly to GISel concepts.
7371       if (NewTy.isVector())
7372         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7373       NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
7374       unsigned NewTySize = NewTy.getSizeInBytes();
7375       assert(NewTySize > 0 && "Could not find appropriate type");
7376 
7377       // If the new LLT cannot cover all of the remaining bits, then consider
7378       // issuing a (or a pair of) unaligned and overlapping load / store.
7379       bool Fast;
7380       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7381       MVT VT = getMVTForLLT(Ty);
7382       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7383           TLI.allowsMisalignedMemoryAccesses(
7384               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7385               MachineMemOperand::MONone, &Fast) &&
7386           Fast)
7387         TySize = Size;
7388       else {
7389         Ty = NewTy;
7390         TySize = NewTySize;
7391       }
7392     }
7393 
7394     if (++NumMemOps > Limit)
7395       return false;
7396 
7397     MemOps.push_back(Ty);
7398     Size -= TySize;
7399   }
7400 
7401   return true;
7402 }
7403 
7404 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7405   if (Ty.isVector())
7406     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7407                                 Ty.getNumElements());
7408   return IntegerType::get(C, Ty.getSizeInBits());
7409 }
7410 
7411 // Get a vectorized representation of the memset value operand, GISel edition.
7412 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7413   MachineRegisterInfo &MRI = *MIB.getMRI();
7414   unsigned NumBits = Ty.getScalarSizeInBits();
7415   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7416   if (!Ty.isVector() && ValVRegAndVal) {
7417     APInt Scalar = ValVRegAndVal->Value.truncOrSelf(8);
7418     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7419     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7420   }
7421 
7422   // Extend the byte value to the larger type, and then multiply by a magic
7423   // value 0x010101... in order to replicate it across every byte.
7424   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7425   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7426     return MIB.buildConstant(Ty, 0).getReg(0);
7427   }
7428 
7429   LLT ExtType = Ty.getScalarType();
7430   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7431   if (NumBits > 8) {
7432     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7433     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7434     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7435   }
7436 
7437   // For vector types create a G_BUILD_VECTOR.
7438   if (Ty.isVector())
7439     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7440 
7441   return Val;
7442 }
7443 
7444 LegalizerHelper::LegalizeResult
7445 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7446                              uint64_t KnownLen, Align Alignment,
7447                              bool IsVolatile) {
7448   auto &MF = *MI.getParent()->getParent();
7449   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7450   auto &DL = MF.getDataLayout();
7451   LLVMContext &C = MF.getFunction().getContext();
7452 
7453   assert(KnownLen != 0 && "Have a zero length memset length!");
7454 
7455   bool DstAlignCanChange = false;
7456   MachineFrameInfo &MFI = MF.getFrameInfo();
7457   bool OptSize = shouldLowerMemFuncForSize(MF);
7458 
7459   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7460   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7461     DstAlignCanChange = true;
7462 
7463   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7464   std::vector<LLT> MemOps;
7465 
7466   const auto &DstMMO = **MI.memoperands_begin();
7467   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7468 
7469   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7470   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7471 
7472   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7473                                      MemOp::Set(KnownLen, DstAlignCanChange,
7474                                                 Alignment,
7475                                                 /*IsZeroMemset=*/IsZeroVal,
7476                                                 /*IsVolatile=*/IsVolatile),
7477                                      DstPtrInfo.getAddrSpace(), ~0u,
7478                                      MF.getFunction().getAttributes(), TLI))
7479     return UnableToLegalize;
7480 
7481   if (DstAlignCanChange) {
7482     // Get an estimate of the type from the LLT.
7483     Type *IRTy = getTypeForLLT(MemOps[0], C);
7484     Align NewAlign = DL.getABITypeAlign(IRTy);
7485     if (NewAlign > Alignment) {
7486       Alignment = NewAlign;
7487       unsigned FI = FIDef->getOperand(1).getIndex();
7488       // Give the stack frame object a larger alignment if needed.
7489       if (MFI.getObjectAlign(FI) < Alignment)
7490         MFI.setObjectAlignment(FI, Alignment);
7491     }
7492   }
7493 
7494   MachineIRBuilder MIB(MI);
7495   // Find the largest store and generate the bit pattern for it.
7496   LLT LargestTy = MemOps[0];
7497   for (unsigned i = 1; i < MemOps.size(); i++)
7498     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7499       LargestTy = MemOps[i];
7500 
7501   // The memset stored value is always defined as an s8, so in order to make it
7502   // work with larger store types we need to repeat the bit pattern across the
7503   // wider type.
7504   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7505 
7506   if (!MemSetValue)
7507     return UnableToLegalize;
7508 
7509   // Generate the stores. For each store type in the list, we generate the
7510   // matching store of that type to the destination address.
7511   LLT PtrTy = MRI.getType(Dst);
7512   unsigned DstOff = 0;
7513   unsigned Size = KnownLen;
7514   for (unsigned I = 0; I < MemOps.size(); I++) {
7515     LLT Ty = MemOps[I];
7516     unsigned TySize = Ty.getSizeInBytes();
7517     if (TySize > Size) {
7518       // Issuing an unaligned load / store pair that overlaps with the previous
7519       // pair. Adjust the offset accordingly.
7520       assert(I == MemOps.size() - 1 && I != 0);
7521       DstOff -= TySize - Size;
7522     }
7523 
7524     // If this store is smaller than the largest store see whether we can get
7525     // the smaller value for free with a truncate.
7526     Register Value = MemSetValue;
7527     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7528       MVT VT = getMVTForLLT(Ty);
7529       MVT LargestVT = getMVTForLLT(LargestTy);
7530       if (!LargestTy.isVector() && !Ty.isVector() &&
7531           TLI.isTruncateFree(LargestVT, VT))
7532         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7533       else
7534         Value = getMemsetValue(Val, Ty, MIB);
7535       if (!Value)
7536         return UnableToLegalize;
7537     }
7538 
7539     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7540 
7541     Register Ptr = Dst;
7542     if (DstOff != 0) {
7543       auto Offset =
7544           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7545       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7546     }
7547 
7548     MIB.buildStore(Value, Ptr, *StoreMMO);
7549     DstOff += Ty.getSizeInBytes();
7550     Size -= TySize;
7551   }
7552 
7553   MI.eraseFromParent();
7554   return Legalized;
7555 }
7556 
7557 LegalizerHelper::LegalizeResult
7558 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7559   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7560 
7561   Register Dst = MI.getOperand(0).getReg();
7562   Register Src = MI.getOperand(1).getReg();
7563   Register Len = MI.getOperand(2).getReg();
7564 
7565   const auto *MMOIt = MI.memoperands_begin();
7566   const MachineMemOperand *MemOp = *MMOIt;
7567   bool IsVolatile = MemOp->isVolatile();
7568 
7569   // See if this is a constant length copy
7570   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7571   // FIXME: support dynamically sized G_MEMCPY_INLINE
7572   assert(LenVRegAndVal.hasValue() &&
7573          "inline memcpy with dynamic size is not yet supported");
7574   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7575   if (KnownLen == 0) {
7576     MI.eraseFromParent();
7577     return Legalized;
7578   }
7579 
7580   const auto &DstMMO = **MI.memoperands_begin();
7581   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7582   Align DstAlign = DstMMO.getBaseAlign();
7583   Align SrcAlign = SrcMMO.getBaseAlign();
7584 
7585   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7586                            IsVolatile);
7587 }
7588 
7589 LegalizerHelper::LegalizeResult
7590 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7591                                    uint64_t KnownLen, Align DstAlign,
7592                                    Align SrcAlign, bool IsVolatile) {
7593   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7594   return lowerMemcpy(MI, Dst, Src, KnownLen,
7595                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7596                      IsVolatile);
7597 }
7598 
7599 LegalizerHelper::LegalizeResult
7600 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7601                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7602                              Align SrcAlign, bool IsVolatile) {
7603   auto &MF = *MI.getParent()->getParent();
7604   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7605   auto &DL = MF.getDataLayout();
7606   LLVMContext &C = MF.getFunction().getContext();
7607 
7608   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7609 
7610   bool DstAlignCanChange = false;
7611   MachineFrameInfo &MFI = MF.getFrameInfo();
7612   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7613 
7614   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7615   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7616     DstAlignCanChange = true;
7617 
7618   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7619   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7620   // if the memcpy is in a tail call position.
7621 
7622   std::vector<LLT> MemOps;
7623 
7624   const auto &DstMMO = **MI.memoperands_begin();
7625   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7626   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7627   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7628 
7629   if (!findGISelOptimalMemOpLowering(
7630           MemOps, Limit,
7631           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7632                       IsVolatile),
7633           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7634           MF.getFunction().getAttributes(), TLI))
7635     return UnableToLegalize;
7636 
7637   if (DstAlignCanChange) {
7638     // Get an estimate of the type from the LLT.
7639     Type *IRTy = getTypeForLLT(MemOps[0], C);
7640     Align NewAlign = DL.getABITypeAlign(IRTy);
7641 
7642     // Don't promote to an alignment that would require dynamic stack
7643     // realignment.
7644     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7645     if (!TRI->hasStackRealignment(MF))
7646       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7647         NewAlign = NewAlign / 2;
7648 
7649     if (NewAlign > Alignment) {
7650       Alignment = NewAlign;
7651       unsigned FI = FIDef->getOperand(1).getIndex();
7652       // Give the stack frame object a larger alignment if needed.
7653       if (MFI.getObjectAlign(FI) < Alignment)
7654         MFI.setObjectAlignment(FI, Alignment);
7655     }
7656   }
7657 
7658   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7659 
7660   MachineIRBuilder MIB(MI);
7661   // Now we need to emit a pair of load and stores for each of the types we've
7662   // collected. I.e. for each type, generate a load from the source pointer of
7663   // that type width, and then generate a corresponding store to the dest buffer
7664   // of that value loaded. This can result in a sequence of loads and stores
7665   // mixed types, depending on what the target specifies as good types to use.
7666   unsigned CurrOffset = 0;
7667   unsigned Size = KnownLen;
7668   for (auto CopyTy : MemOps) {
7669     // Issuing an unaligned load / store pair  that overlaps with the previous
7670     // pair. Adjust the offset accordingly.
7671     if (CopyTy.getSizeInBytes() > Size)
7672       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7673 
7674     // Construct MMOs for the accesses.
7675     auto *LoadMMO =
7676         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7677     auto *StoreMMO =
7678         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7679 
7680     // Create the load.
7681     Register LoadPtr = Src;
7682     Register Offset;
7683     if (CurrOffset != 0) {
7684       LLT SrcTy = MRI.getType(Src);
7685       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
7686                    .getReg(0);
7687       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7688     }
7689     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7690 
7691     // Create the store.
7692     Register StorePtr = Dst;
7693     if (CurrOffset != 0) {
7694       LLT DstTy = MRI.getType(Dst);
7695       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7696     }
7697     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7698     CurrOffset += CopyTy.getSizeInBytes();
7699     Size -= CopyTy.getSizeInBytes();
7700   }
7701 
7702   MI.eraseFromParent();
7703   return Legalized;
7704 }
7705 
7706 LegalizerHelper::LegalizeResult
7707 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7708                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7709                               bool IsVolatile) {
7710   auto &MF = *MI.getParent()->getParent();
7711   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7712   auto &DL = MF.getDataLayout();
7713   LLVMContext &C = MF.getFunction().getContext();
7714 
7715   assert(KnownLen != 0 && "Have a zero length memmove length!");
7716 
7717   bool DstAlignCanChange = false;
7718   MachineFrameInfo &MFI = MF.getFrameInfo();
7719   bool OptSize = shouldLowerMemFuncForSize(MF);
7720   Align Alignment = commonAlignment(DstAlign, SrcAlign);
7721 
7722   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7723   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7724     DstAlignCanChange = true;
7725 
7726   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7727   std::vector<LLT> MemOps;
7728 
7729   const auto &DstMMO = **MI.memoperands_begin();
7730   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7731   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7732   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7733 
7734   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7735   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
7736   // same thing here.
7737   if (!findGISelOptimalMemOpLowering(
7738           MemOps, Limit,
7739           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7740                       /*IsVolatile*/ true),
7741           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7742           MF.getFunction().getAttributes(), TLI))
7743     return UnableToLegalize;
7744 
7745   if (DstAlignCanChange) {
7746     // Get an estimate of the type from the LLT.
7747     Type *IRTy = getTypeForLLT(MemOps[0], C);
7748     Align NewAlign = DL.getABITypeAlign(IRTy);
7749 
7750     // Don't promote to an alignment that would require dynamic stack
7751     // realignment.
7752     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7753     if (!TRI->hasStackRealignment(MF))
7754       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7755         NewAlign = NewAlign / 2;
7756 
7757     if (NewAlign > Alignment) {
7758       Alignment = NewAlign;
7759       unsigned FI = FIDef->getOperand(1).getIndex();
7760       // Give the stack frame object a larger alignment if needed.
7761       if (MFI.getObjectAlign(FI) < Alignment)
7762         MFI.setObjectAlignment(FI, Alignment);
7763     }
7764   }
7765 
7766   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
7767 
7768   MachineIRBuilder MIB(MI);
7769   // Memmove requires that we perform the loads first before issuing the stores.
7770   // Apart from that, this loop is pretty much doing the same thing as the
7771   // memcpy codegen function.
7772   unsigned CurrOffset = 0;
7773   SmallVector<Register, 16> LoadVals;
7774   for (auto CopyTy : MemOps) {
7775     // Construct MMO for the load.
7776     auto *LoadMMO =
7777         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7778 
7779     // Create the load.
7780     Register LoadPtr = Src;
7781     if (CurrOffset != 0) {
7782       LLT SrcTy = MRI.getType(Src);
7783       auto Offset =
7784           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
7785       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7786     }
7787     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
7788     CurrOffset += CopyTy.getSizeInBytes();
7789   }
7790 
7791   CurrOffset = 0;
7792   for (unsigned I = 0; I < MemOps.size(); ++I) {
7793     LLT CopyTy = MemOps[I];
7794     // Now store the values loaded.
7795     auto *StoreMMO =
7796         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7797 
7798     Register StorePtr = Dst;
7799     if (CurrOffset != 0) {
7800       LLT DstTy = MRI.getType(Dst);
7801       auto Offset =
7802           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
7803       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7804     }
7805     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
7806     CurrOffset += CopyTy.getSizeInBytes();
7807   }
7808   MI.eraseFromParent();
7809   return Legalized;
7810 }
7811 
7812 LegalizerHelper::LegalizeResult
7813 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
7814   const unsigned Opc = MI.getOpcode();
7815   // This combine is fairly complex so it's not written with a separate
7816   // matcher function.
7817   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
7818           Opc == TargetOpcode::G_MEMSET) &&
7819          "Expected memcpy like instruction");
7820 
7821   auto MMOIt = MI.memoperands_begin();
7822   const MachineMemOperand *MemOp = *MMOIt;
7823 
7824   Align DstAlign = MemOp->getBaseAlign();
7825   Align SrcAlign;
7826   Register Dst = MI.getOperand(0).getReg();
7827   Register Src = MI.getOperand(1).getReg();
7828   Register Len = MI.getOperand(2).getReg();
7829 
7830   if (Opc != TargetOpcode::G_MEMSET) {
7831     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
7832     MemOp = *(++MMOIt);
7833     SrcAlign = MemOp->getBaseAlign();
7834   }
7835 
7836   // See if this is a constant length copy
7837   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7838   if (!LenVRegAndVal)
7839     return UnableToLegalize;
7840   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7841 
7842   if (KnownLen == 0) {
7843     MI.eraseFromParent();
7844     return Legalized;
7845   }
7846 
7847   bool IsVolatile = MemOp->isVolatile();
7848   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
7849     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7850                              IsVolatile);
7851 
7852   // Don't try to optimize volatile.
7853   if (IsVolatile)
7854     return UnableToLegalize;
7855 
7856   if (MaxLen && KnownLen > MaxLen)
7857     return UnableToLegalize;
7858 
7859   if (Opc == TargetOpcode::G_MEMCPY) {
7860     auto &MF = *MI.getParent()->getParent();
7861     const auto &TLI = *MF.getSubtarget().getTargetLowering();
7862     bool OptSize = shouldLowerMemFuncForSize(MF);
7863     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
7864     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
7865                        IsVolatile);
7866   }
7867   if (Opc == TargetOpcode::G_MEMMOVE)
7868     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
7869   if (Opc == TargetOpcode::G_MEMSET)
7870     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
7871   return UnableToLegalize;
7872 }
7873