xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
19 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/MachineConstantPool.h"
26 #include "llvm/CodeGen/MachineFrameInfo.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/CodeGen/TargetFrameLowering.h"
29 #include "llvm/CodeGen/TargetInstrInfo.h"
30 #include "llvm/CodeGen/TargetLowering.h"
31 #include "llvm/CodeGen/TargetOpcodes.h"
32 #include "llvm/CodeGen/TargetSubtargetInfo.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/MathExtras.h"
36 #include "llvm/Support/raw_ostream.h"
37 #include "llvm/Target/TargetMachine.h"
38 #include <numeric>
39 #include <optional>
40 
41 #define DEBUG_TYPE "legalizer"
42 
43 using namespace llvm;
44 using namespace LegalizeActions;
45 using namespace MIPatternMatch;
46 
47 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
48 ///
49 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
50 /// with any leftover piece as type \p LeftoverTy
51 ///
52 /// Returns -1 in the first element of the pair if the breakdown is not
53 /// satisfiable.
54 static std::pair<int, int>
55 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
56   assert(!LeftoverTy.isValid() && "this is an out argument");
57 
58   unsigned Size = OrigTy.getSizeInBits();
59   unsigned NarrowSize = NarrowTy.getSizeInBits();
60   unsigned NumParts = Size / NarrowSize;
61   unsigned LeftoverSize = Size - NumParts * NarrowSize;
62   assert(Size > NarrowSize);
63 
64   if (LeftoverSize == 0)
65     return {NumParts, 0};
66 
67   if (NarrowTy.isVector()) {
68     unsigned EltSize = OrigTy.getScalarSizeInBits();
69     if (LeftoverSize % EltSize != 0)
70       return {-1, -1};
71     LeftoverTy = LLT::scalarOrVector(
72         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
73   } else {
74     LeftoverTy = LLT::scalar(LeftoverSize);
75   }
76 
77   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
78   return std::make_pair(NumParts, NumLeftover);
79 }
80 
81 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
82 
83   if (!Ty.isScalar())
84     return nullptr;
85 
86   switch (Ty.getSizeInBits()) {
87   case 16:
88     return Type::getHalfTy(Ctx);
89   case 32:
90     return Type::getFloatTy(Ctx);
91   case 64:
92     return Type::getDoubleTy(Ctx);
93   case 80:
94     return Type::getX86_FP80Ty(Ctx);
95   case 128:
96     return Type::getFP128Ty(Ctx);
97   default:
98     return nullptr;
99   }
100 }
101 
102 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
103                                  GISelChangeObserver &Observer,
104                                  MachineIRBuilder &Builder)
105     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
106       LI(*MF.getSubtarget().getLegalizerInfo()),
107       TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}
108 
109 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
110                                  GISelChangeObserver &Observer,
111                                  MachineIRBuilder &B, GISelKnownBits *KB)
112     : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
113       TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}
114 
115 LegalizerHelper::LegalizeResult
116 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
117                                    LostDebugLocObserver &LocObserver) {
118   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
119 
120   MIRBuilder.setInstrAndDebugLoc(MI);
121 
122   if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
123       MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
124     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
125   auto Step = LI.getAction(MI, MRI);
126   switch (Step.Action) {
127   case Legal:
128     LLVM_DEBUG(dbgs() << ".. Already legal\n");
129     return AlreadyLegal;
130   case Libcall:
131     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
132     return libcall(MI, LocObserver);
133   case NarrowScalar:
134     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
135     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
136   case WidenScalar:
137     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
138     return widenScalar(MI, Step.TypeIdx, Step.NewType);
139   case Bitcast:
140     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
141     return bitcast(MI, Step.TypeIdx, Step.NewType);
142   case Lower:
143     LLVM_DEBUG(dbgs() << ".. Lower\n");
144     return lower(MI, Step.TypeIdx, Step.NewType);
145   case FewerElements:
146     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
147     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
148   case MoreElements:
149     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
150     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
151   case Custom:
152     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
153     return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
154   default:
155     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
156     return UnableToLegalize;
157   }
158 }
159 
160 void LegalizerHelper::extractParts(Register Reg, LLT Ty, int NumParts,
161                                    SmallVectorImpl<Register> &VRegs) {
162   for (int i = 0; i < NumParts; ++i)
163     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
164   MIRBuilder.buildUnmerge(VRegs, Reg);
165 }
166 
167 bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
168                                    LLT MainTy, LLT &LeftoverTy,
169                                    SmallVectorImpl<Register> &VRegs,
170                                    SmallVectorImpl<Register> &LeftoverRegs) {
171   assert(!LeftoverTy.isValid() && "this is an out argument");
172 
173   unsigned RegSize = RegTy.getSizeInBits();
174   unsigned MainSize = MainTy.getSizeInBits();
175   unsigned NumParts = RegSize / MainSize;
176   unsigned LeftoverSize = RegSize - NumParts * MainSize;
177 
178   // Use an unmerge when possible.
179   if (LeftoverSize == 0) {
180     for (unsigned I = 0; I < NumParts; ++I)
181       VRegs.push_back(MRI.createGenericVirtualRegister(MainTy));
182     MIRBuilder.buildUnmerge(VRegs, Reg);
183     return true;
184   }
185 
186   // Perform irregular split. Leftover is last element of RegPieces.
187   if (MainTy.isVector()) {
188     SmallVector<Register, 8> RegPieces;
189     extractVectorParts(Reg, MainTy.getNumElements(), RegPieces);
190     for (unsigned i = 0; i < RegPieces.size() - 1; ++i)
191       VRegs.push_back(RegPieces[i]);
192     LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]);
193     LeftoverTy = MRI.getType(LeftoverRegs[0]);
194     return true;
195   }
196 
197   LeftoverTy = LLT::scalar(LeftoverSize);
198   // For irregular sizes, extract the individual parts.
199   for (unsigned I = 0; I != NumParts; ++I) {
200     Register NewReg = MRI.createGenericVirtualRegister(MainTy);
201     VRegs.push_back(NewReg);
202     MIRBuilder.buildExtract(NewReg, Reg, MainSize * I);
203   }
204 
205   for (unsigned Offset = MainSize * NumParts; Offset < RegSize;
206        Offset += LeftoverSize) {
207     Register NewReg = MRI.createGenericVirtualRegister(LeftoverTy);
208     LeftoverRegs.push_back(NewReg);
209     MIRBuilder.buildExtract(NewReg, Reg, Offset);
210   }
211 
212   return true;
213 }
214 
215 void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts,
216                                          SmallVectorImpl<Register> &VRegs) {
217   LLT RegTy = MRI.getType(Reg);
218   assert(RegTy.isVector() && "Expected a vector type");
219 
220   LLT EltTy = RegTy.getElementType();
221   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
222   unsigned RegNumElts = RegTy.getNumElements();
223   unsigned LeftoverNumElts = RegNumElts % NumElts;
224   unsigned NumNarrowTyPieces = RegNumElts / NumElts;
225 
226   // Perfect split without leftover
227   if (LeftoverNumElts == 0)
228     return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs);
229 
230   // Irregular split. Provide direct access to all elements for artifact
231   // combiner using unmerge to elements. Then build vectors with NumElts
232   // elements. Remaining element(s) will be (used to build vector) Leftover.
233   SmallVector<Register, 8> Elts;
234   extractParts(Reg, EltTy, RegNumElts, Elts);
235 
236   unsigned Offset = 0;
237   // Requested sub-vectors of NarrowTy.
238   for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) {
239     ArrayRef<Register> Pieces(&Elts[Offset], NumElts);
240     VRegs.push_back(MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
241   }
242 
243   // Leftover element(s).
244   if (LeftoverNumElts == 1) {
245     VRegs.push_back(Elts[Offset]);
246   } else {
247     LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy);
248     ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts);
249     VRegs.push_back(
250         MIRBuilder.buildMergeLikeInstr(LeftoverTy, Pieces).getReg(0));
251   }
252 }
253 
254 void LegalizerHelper::insertParts(Register DstReg,
255                                   LLT ResultTy, LLT PartTy,
256                                   ArrayRef<Register> PartRegs,
257                                   LLT LeftoverTy,
258                                   ArrayRef<Register> LeftoverRegs) {
259   if (!LeftoverTy.isValid()) {
260     assert(LeftoverRegs.empty());
261 
262     if (!ResultTy.isVector()) {
263       MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
264       return;
265     }
266 
267     if (PartTy.isVector())
268       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
269     else
270       MIRBuilder.buildBuildVector(DstReg, PartRegs);
271     return;
272   }
273 
274   // Merge sub-vectors with different number of elements and insert into DstReg.
275   if (ResultTy.isVector()) {
276     assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
277     SmallVector<Register, 8> AllRegs;
278     for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
279       AllRegs.push_back(Reg);
280     return mergeMixedSubvectors(DstReg, AllRegs);
281   }
282 
283   SmallVector<Register> GCDRegs;
284   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
285   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
286     extractGCDType(GCDRegs, GCDTy, PartReg);
287   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
288   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
289 }
290 
291 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
292                                        Register Reg) {
293   LLT Ty = MRI.getType(Reg);
294   SmallVector<Register, 8> RegElts;
295   extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts);
296   Elts.append(RegElts);
297 }
298 
299 /// Merge \p PartRegs with different types into \p DstReg.
300 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
301                                            ArrayRef<Register> PartRegs) {
302   SmallVector<Register, 8> AllElts;
303   for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
304     appendVectorElts(AllElts, PartRegs[i]);
305 
306   Register Leftover = PartRegs[PartRegs.size() - 1];
307   if (MRI.getType(Leftover).isScalar())
308     AllElts.push_back(Leftover);
309   else
310     appendVectorElts(AllElts, Leftover);
311 
312   MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
313 }
314 
315 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
316 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
317                               const MachineInstr &MI) {
318   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
319 
320   const int StartIdx = Regs.size();
321   const int NumResults = MI.getNumOperands() - 1;
322   Regs.resize(Regs.size() + NumResults);
323   for (int I = 0; I != NumResults; ++I)
324     Regs[StartIdx + I] = MI.getOperand(I).getReg();
325 }
326 
327 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
328                                      LLT GCDTy, Register SrcReg) {
329   LLT SrcTy = MRI.getType(SrcReg);
330   if (SrcTy == GCDTy) {
331     // If the source already evenly divides the result type, we don't need to do
332     // anything.
333     Parts.push_back(SrcReg);
334   } else {
335     // Need to split into common type sized pieces.
336     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
337     getUnmergeResults(Parts, *Unmerge);
338   }
339 }
340 
341 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
342                                     LLT NarrowTy, Register SrcReg) {
343   LLT SrcTy = MRI.getType(SrcReg);
344   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
345   extractGCDType(Parts, GCDTy, SrcReg);
346   return GCDTy;
347 }
348 
349 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
350                                          SmallVectorImpl<Register> &VRegs,
351                                          unsigned PadStrategy) {
352   LLT LCMTy = getLCMType(DstTy, NarrowTy);
353 
354   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
355   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
356   int NumOrigSrc = VRegs.size();
357 
358   Register PadReg;
359 
360   // Get a value we can use to pad the source value if the sources won't evenly
361   // cover the result type.
362   if (NumOrigSrc < NumParts * NumSubParts) {
363     if (PadStrategy == TargetOpcode::G_ZEXT)
364       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
365     else if (PadStrategy == TargetOpcode::G_ANYEXT)
366       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
367     else {
368       assert(PadStrategy == TargetOpcode::G_SEXT);
369 
370       // Shift the sign bit of the low register through the high register.
371       auto ShiftAmt =
372         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
373       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
374     }
375   }
376 
377   // Registers for the final merge to be produced.
378   SmallVector<Register, 4> Remerge(NumParts);
379 
380   // Registers needed for intermediate merges, which will be merged into a
381   // source for Remerge.
382   SmallVector<Register, 4> SubMerge(NumSubParts);
383 
384   // Once we've fully read off the end of the original source bits, we can reuse
385   // the same high bits for remaining padding elements.
386   Register AllPadReg;
387 
388   // Build merges to the LCM type to cover the original result type.
389   for (int I = 0; I != NumParts; ++I) {
390     bool AllMergePartsArePadding = true;
391 
392     // Build the requested merges to the requested type.
393     for (int J = 0; J != NumSubParts; ++J) {
394       int Idx = I * NumSubParts + J;
395       if (Idx >= NumOrigSrc) {
396         SubMerge[J] = PadReg;
397         continue;
398       }
399 
400       SubMerge[J] = VRegs[Idx];
401 
402       // There are meaningful bits here we can't reuse later.
403       AllMergePartsArePadding = false;
404     }
405 
406     // If we've filled up a complete piece with padding bits, we can directly
407     // emit the natural sized constant if applicable, rather than a merge of
408     // smaller constants.
409     if (AllMergePartsArePadding && !AllPadReg) {
410       if (PadStrategy == TargetOpcode::G_ANYEXT)
411         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
412       else if (PadStrategy == TargetOpcode::G_ZEXT)
413         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
414 
415       // If this is a sign extension, we can't materialize a trivial constant
416       // with the right type and have to produce a merge.
417     }
418 
419     if (AllPadReg) {
420       // Avoid creating additional instructions if we're just adding additional
421       // copies of padding bits.
422       Remerge[I] = AllPadReg;
423       continue;
424     }
425 
426     if (NumSubParts == 1)
427       Remerge[I] = SubMerge[0];
428     else
429       Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);
430 
431     // In the sign extend padding case, re-use the first all-signbit merge.
432     if (AllMergePartsArePadding && !AllPadReg)
433       AllPadReg = Remerge[I];
434   }
435 
436   VRegs = std::move(Remerge);
437   return LCMTy;
438 }
439 
440 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
441                                                ArrayRef<Register> RemergeRegs) {
442   LLT DstTy = MRI.getType(DstReg);
443 
444   // Create the merge to the widened source, and extract the relevant bits into
445   // the result.
446 
447   if (DstTy == LCMTy) {
448     MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
449     return;
450   }
451 
452   auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
453   if (DstTy.isScalar() && LCMTy.isScalar()) {
454     MIRBuilder.buildTrunc(DstReg, Remerge);
455     return;
456   }
457 
458   if (LCMTy.isVector()) {
459     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
460     SmallVector<Register, 8> UnmergeDefs(NumDefs);
461     UnmergeDefs[0] = DstReg;
462     for (unsigned I = 1; I != NumDefs; ++I)
463       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
464 
465     MIRBuilder.buildUnmerge(UnmergeDefs,
466                             MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
467     return;
468   }
469 
470   llvm_unreachable("unhandled case");
471 }
472 
473 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
474 #define RTLIBCASE_INT(LibcallPrefix)                                           \
475   do {                                                                         \
476     switch (Size) {                                                            \
477     case 32:                                                                   \
478       return RTLIB::LibcallPrefix##32;                                         \
479     case 64:                                                                   \
480       return RTLIB::LibcallPrefix##64;                                         \
481     case 128:                                                                  \
482       return RTLIB::LibcallPrefix##128;                                        \
483     default:                                                                   \
484       llvm_unreachable("unexpected size");                                     \
485     }                                                                          \
486   } while (0)
487 
488 #define RTLIBCASE(LibcallPrefix)                                               \
489   do {                                                                         \
490     switch (Size) {                                                            \
491     case 32:                                                                   \
492       return RTLIB::LibcallPrefix##32;                                         \
493     case 64:                                                                   \
494       return RTLIB::LibcallPrefix##64;                                         \
495     case 80:                                                                   \
496       return RTLIB::LibcallPrefix##80;                                         \
497     case 128:                                                                  \
498       return RTLIB::LibcallPrefix##128;                                        \
499     default:                                                                   \
500       llvm_unreachable("unexpected size");                                     \
501     }                                                                          \
502   } while (0)
503 
504   switch (Opcode) {
505   case TargetOpcode::G_MUL:
506     RTLIBCASE_INT(MUL_I);
507   case TargetOpcode::G_SDIV:
508     RTLIBCASE_INT(SDIV_I);
509   case TargetOpcode::G_UDIV:
510     RTLIBCASE_INT(UDIV_I);
511   case TargetOpcode::G_SREM:
512     RTLIBCASE_INT(SREM_I);
513   case TargetOpcode::G_UREM:
514     RTLIBCASE_INT(UREM_I);
515   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
516     RTLIBCASE_INT(CTLZ_I);
517   case TargetOpcode::G_FADD:
518     RTLIBCASE(ADD_F);
519   case TargetOpcode::G_FSUB:
520     RTLIBCASE(SUB_F);
521   case TargetOpcode::G_FMUL:
522     RTLIBCASE(MUL_F);
523   case TargetOpcode::G_FDIV:
524     RTLIBCASE(DIV_F);
525   case TargetOpcode::G_FEXP:
526     RTLIBCASE(EXP_F);
527   case TargetOpcode::G_FEXP2:
528     RTLIBCASE(EXP2_F);
529   case TargetOpcode::G_FREM:
530     RTLIBCASE(REM_F);
531   case TargetOpcode::G_FPOW:
532     RTLIBCASE(POW_F);
533   case TargetOpcode::G_FMA:
534     RTLIBCASE(FMA_F);
535   case TargetOpcode::G_FSIN:
536     RTLIBCASE(SIN_F);
537   case TargetOpcode::G_FCOS:
538     RTLIBCASE(COS_F);
539   case TargetOpcode::G_FLOG10:
540     RTLIBCASE(LOG10_F);
541   case TargetOpcode::G_FLOG:
542     RTLIBCASE(LOG_F);
543   case TargetOpcode::G_FLOG2:
544     RTLIBCASE(LOG2_F);
545   case TargetOpcode::G_FLDEXP:
546     RTLIBCASE(LDEXP_F);
547   case TargetOpcode::G_FCEIL:
548     RTLIBCASE(CEIL_F);
549   case TargetOpcode::G_FFLOOR:
550     RTLIBCASE(FLOOR_F);
551   case TargetOpcode::G_FMINNUM:
552     RTLIBCASE(FMIN_F);
553   case TargetOpcode::G_FMAXNUM:
554     RTLIBCASE(FMAX_F);
555   case TargetOpcode::G_FSQRT:
556     RTLIBCASE(SQRT_F);
557   case TargetOpcode::G_FRINT:
558     RTLIBCASE(RINT_F);
559   case TargetOpcode::G_FNEARBYINT:
560     RTLIBCASE(NEARBYINT_F);
561   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
562     RTLIBCASE(ROUNDEVEN_F);
563   }
564   llvm_unreachable("Unknown libcall function");
565 }
566 
567 /// True if an instruction is in tail position in its caller. Intended for
568 /// legalizing libcalls as tail calls when possible.
569 static bool isLibCallInTailPosition(MachineInstr &MI,
570                                     const TargetInstrInfo &TII,
571                                     MachineRegisterInfo &MRI) {
572   MachineBasicBlock &MBB = *MI.getParent();
573   const Function &F = MBB.getParent()->getFunction();
574 
575   // Conservatively require the attributes of the call to match those of
576   // the return. Ignore NoAlias and NonNull because they don't affect the
577   // call sequence.
578   AttributeList CallerAttrs = F.getAttributes();
579   if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
580           .removeAttribute(Attribute::NoAlias)
581           .removeAttribute(Attribute::NonNull)
582           .hasAttributes())
583     return false;
584 
585   // It's not safe to eliminate the sign / zero extension of the return value.
586   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
587       CallerAttrs.hasRetAttr(Attribute::SExt))
588     return false;
589 
590   // Only tail call if the following instruction is a standard return or if we
591   // have a `thisreturn` callee, and a sequence like:
592   //
593   //   G_MEMCPY %0, %1, %2
594   //   $x0 = COPY %0
595   //   RET_ReallyLR implicit $x0
596   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
597   if (Next != MBB.instr_end() && Next->isCopy()) {
598     switch (MI.getOpcode()) {
599     default:
600       llvm_unreachable("unsupported opcode");
601     case TargetOpcode::G_BZERO:
602       return false;
603     case TargetOpcode::G_MEMCPY:
604     case TargetOpcode::G_MEMMOVE:
605     case TargetOpcode::G_MEMSET:
606       break;
607     }
608 
609     Register VReg = MI.getOperand(0).getReg();
610     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
611       return false;
612 
613     Register PReg = Next->getOperand(0).getReg();
614     if (!PReg.isPhysical())
615       return false;
616 
617     auto Ret = next_nodbg(Next, MBB.instr_end());
618     if (Ret == MBB.instr_end() || !Ret->isReturn())
619       return false;
620 
621     if (Ret->getNumImplicitOperands() != 1)
622       return false;
623 
624     if (PReg != Ret->getOperand(0).getReg())
625       return false;
626 
627     // Skip over the COPY that we just validated.
628     Next = Ret;
629   }
630 
631   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
632     return false;
633 
634   return true;
635 }
636 
637 LegalizerHelper::LegalizeResult
638 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
639                     const CallLowering::ArgInfo &Result,
640                     ArrayRef<CallLowering::ArgInfo> Args,
641                     const CallingConv::ID CC) {
642   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
643 
644   CallLowering::CallLoweringInfo Info;
645   Info.CallConv = CC;
646   Info.Callee = MachineOperand::CreateES(Name);
647   Info.OrigRet = Result;
648   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
649   if (!CLI.lowerCall(MIRBuilder, Info))
650     return LegalizerHelper::UnableToLegalize;
651 
652   return LegalizerHelper::Legalized;
653 }
654 
655 LegalizerHelper::LegalizeResult
656 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
657                     const CallLowering::ArgInfo &Result,
658                     ArrayRef<CallLowering::ArgInfo> Args) {
659   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
660   const char *Name = TLI.getLibcallName(Libcall);
661   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
662   return createLibcall(MIRBuilder, Name, Result, Args, CC);
663 }
664 
665 // Useful for libcalls where all operands have the same type.
666 static LegalizerHelper::LegalizeResult
667 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
668               Type *OpType) {
669   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
670 
671   // FIXME: What does the original arg index mean here?
672   SmallVector<CallLowering::ArgInfo, 3> Args;
673   for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
674     Args.push_back({MO.getReg(), OpType, 0});
675   return createLibcall(MIRBuilder, Libcall,
676                        {MI.getOperand(0).getReg(), OpType, 0}, Args);
677 }
678 
679 LegalizerHelper::LegalizeResult
680 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
681                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
682   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
683 
684   SmallVector<CallLowering::ArgInfo, 3> Args;
685   // Add all the args, except for the last which is an imm denoting 'tail'.
686   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
687     Register Reg = MI.getOperand(i).getReg();
688 
689     // Need derive an IR type for call lowering.
690     LLT OpLLT = MRI.getType(Reg);
691     Type *OpTy = nullptr;
692     if (OpLLT.isPointer())
693       OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
694     else
695       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
696     Args.push_back({Reg, OpTy, 0});
697   }
698 
699   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
700   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
701   RTLIB::Libcall RTLibcall;
702   unsigned Opc = MI.getOpcode();
703   switch (Opc) {
704   case TargetOpcode::G_BZERO:
705     RTLibcall = RTLIB::BZERO;
706     break;
707   case TargetOpcode::G_MEMCPY:
708     RTLibcall = RTLIB::MEMCPY;
709     Args[0].Flags[0].setReturned();
710     break;
711   case TargetOpcode::G_MEMMOVE:
712     RTLibcall = RTLIB::MEMMOVE;
713     Args[0].Flags[0].setReturned();
714     break;
715   case TargetOpcode::G_MEMSET:
716     RTLibcall = RTLIB::MEMSET;
717     Args[0].Flags[0].setReturned();
718     break;
719   default:
720     llvm_unreachable("unsupported opcode");
721   }
722   const char *Name = TLI.getLibcallName(RTLibcall);
723 
724   // Unsupported libcall on the target.
725   if (!Name) {
726     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
727                       << MIRBuilder.getTII().getName(Opc) << "\n");
728     return LegalizerHelper::UnableToLegalize;
729   }
730 
731   CallLowering::CallLoweringInfo Info;
732   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
733   Info.Callee = MachineOperand::CreateES(Name);
734   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
735   Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() &&
736                     isLibCallInTailPosition(MI, MIRBuilder.getTII(), MRI);
737 
738   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
739   if (!CLI.lowerCall(MIRBuilder, Info))
740     return LegalizerHelper::UnableToLegalize;
741 
742   if (Info.LoweredTailCall) {
743     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
744 
745     // Check debug locations before removing the return.
746     LocObserver.checkpoint(true);
747 
748     // We must have a return following the call (or debug insts) to get past
749     // isLibCallInTailPosition.
750     do {
751       MachineInstr *Next = MI.getNextNode();
752       assert(Next &&
753              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
754              "Expected instr following MI to be return or debug inst?");
755       // We lowered a tail call, so the call is now the return from the block.
756       // Delete the old return.
757       Next->eraseFromParent();
758     } while (MI.getNextNode());
759 
760     // We expect to lose the debug location from the return.
761     LocObserver.checkpoint(false);
762   }
763 
764   return LegalizerHelper::Legalized;
765 }
766 
767 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
768                                        Type *FromType) {
769   auto ToMVT = MVT::getVT(ToType);
770   auto FromMVT = MVT::getVT(FromType);
771 
772   switch (Opcode) {
773   case TargetOpcode::G_FPEXT:
774     return RTLIB::getFPEXT(FromMVT, ToMVT);
775   case TargetOpcode::G_FPTRUNC:
776     return RTLIB::getFPROUND(FromMVT, ToMVT);
777   case TargetOpcode::G_FPTOSI:
778     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
779   case TargetOpcode::G_FPTOUI:
780     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
781   case TargetOpcode::G_SITOFP:
782     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
783   case TargetOpcode::G_UITOFP:
784     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
785   }
786   llvm_unreachable("Unsupported libcall function");
787 }
788 
789 static LegalizerHelper::LegalizeResult
790 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
791                   Type *FromType) {
792   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
793   return createLibcall(MIRBuilder, Libcall,
794                        {MI.getOperand(0).getReg(), ToType, 0},
795                        {{MI.getOperand(1).getReg(), FromType, 0}});
796 }
797 
798 LegalizerHelper::LegalizeResult
799 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
800   LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
801   unsigned Size = LLTy.getSizeInBits();
802   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
803 
804   switch (MI.getOpcode()) {
805   default:
806     return UnableToLegalize;
807   case TargetOpcode::G_MUL:
808   case TargetOpcode::G_SDIV:
809   case TargetOpcode::G_UDIV:
810   case TargetOpcode::G_SREM:
811   case TargetOpcode::G_UREM:
812   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
813     Type *HLTy = IntegerType::get(Ctx, Size);
814     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
815     if (Status != Legalized)
816       return Status;
817     break;
818   }
819   case TargetOpcode::G_FADD:
820   case TargetOpcode::G_FSUB:
821   case TargetOpcode::G_FMUL:
822   case TargetOpcode::G_FDIV:
823   case TargetOpcode::G_FMA:
824   case TargetOpcode::G_FPOW:
825   case TargetOpcode::G_FREM:
826   case TargetOpcode::G_FCOS:
827   case TargetOpcode::G_FSIN:
828   case TargetOpcode::G_FLOG10:
829   case TargetOpcode::G_FLOG:
830   case TargetOpcode::G_FLOG2:
831   case TargetOpcode::G_FLDEXP:
832   case TargetOpcode::G_FEXP:
833   case TargetOpcode::G_FEXP2:
834   case TargetOpcode::G_FCEIL:
835   case TargetOpcode::G_FFLOOR:
836   case TargetOpcode::G_FMINNUM:
837   case TargetOpcode::G_FMAXNUM:
838   case TargetOpcode::G_FSQRT:
839   case TargetOpcode::G_FRINT:
840   case TargetOpcode::G_FNEARBYINT:
841   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
842     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
843     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
844       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
845       return UnableToLegalize;
846     }
847     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
848     if (Status != Legalized)
849       return Status;
850     break;
851   }
852   case TargetOpcode::G_FPEXT:
853   case TargetOpcode::G_FPTRUNC: {
854     Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
855     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
856     if (!FromTy || !ToTy)
857       return UnableToLegalize;
858     LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy );
859     if (Status != Legalized)
860       return Status;
861     break;
862   }
863   case TargetOpcode::G_FPTOSI:
864   case TargetOpcode::G_FPTOUI: {
865     // FIXME: Support other types
866     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
867     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
868     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
869       return UnableToLegalize;
870     LegalizeResult Status = conversionLibcall(
871         MI, MIRBuilder,
872         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
873         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx));
874     if (Status != Legalized)
875       return Status;
876     break;
877   }
878   case TargetOpcode::G_SITOFP:
879   case TargetOpcode::G_UITOFP: {
880     // FIXME: Support other types
881     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
882     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
883     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
884       return UnableToLegalize;
885     LegalizeResult Status = conversionLibcall(
886         MI, MIRBuilder,
887         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
888         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx));
889     if (Status != Legalized)
890       return Status;
891     break;
892   }
893   case TargetOpcode::G_BZERO:
894   case TargetOpcode::G_MEMCPY:
895   case TargetOpcode::G_MEMMOVE:
896   case TargetOpcode::G_MEMSET: {
897     LegalizeResult Result =
898         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
899     if (Result != Legalized)
900       return Result;
901     MI.eraseFromParent();
902     return Result;
903   }
904   }
905 
906   MI.eraseFromParent();
907   return Legalized;
908 }
909 
910 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
911                                                               unsigned TypeIdx,
912                                                               LLT NarrowTy) {
913   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
914   uint64_t NarrowSize = NarrowTy.getSizeInBits();
915 
916   switch (MI.getOpcode()) {
917   default:
918     return UnableToLegalize;
919   case TargetOpcode::G_IMPLICIT_DEF: {
920     Register DstReg = MI.getOperand(0).getReg();
921     LLT DstTy = MRI.getType(DstReg);
922 
923     // If SizeOp0 is not an exact multiple of NarrowSize, emit
924     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
925     // FIXME: Although this would also be legal for the general case, it causes
926     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
927     //  combines not being hit). This seems to be a problem related to the
928     //  artifact combiner.
929     if (SizeOp0 % NarrowSize != 0) {
930       LLT ImplicitTy = NarrowTy;
931       if (DstTy.isVector())
932         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
933 
934       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
935       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
936 
937       MI.eraseFromParent();
938       return Legalized;
939     }
940 
941     int NumParts = SizeOp0 / NarrowSize;
942 
943     SmallVector<Register, 2> DstRegs;
944     for (int i = 0; i < NumParts; ++i)
945       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
946 
947     if (DstTy.isVector())
948       MIRBuilder.buildBuildVector(DstReg, DstRegs);
949     else
950       MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
951     MI.eraseFromParent();
952     return Legalized;
953   }
954   case TargetOpcode::G_CONSTANT: {
955     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
956     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
957     unsigned TotalSize = Ty.getSizeInBits();
958     unsigned NarrowSize = NarrowTy.getSizeInBits();
959     int NumParts = TotalSize / NarrowSize;
960 
961     SmallVector<Register, 4> PartRegs;
962     for (int I = 0; I != NumParts; ++I) {
963       unsigned Offset = I * NarrowSize;
964       auto K = MIRBuilder.buildConstant(NarrowTy,
965                                         Val.lshr(Offset).trunc(NarrowSize));
966       PartRegs.push_back(K.getReg(0));
967     }
968 
969     LLT LeftoverTy;
970     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
971     SmallVector<Register, 1> LeftoverRegs;
972     if (LeftoverBits != 0) {
973       LeftoverTy = LLT::scalar(LeftoverBits);
974       auto K = MIRBuilder.buildConstant(
975         LeftoverTy,
976         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
977       LeftoverRegs.push_back(K.getReg(0));
978     }
979 
980     insertParts(MI.getOperand(0).getReg(),
981                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
982 
983     MI.eraseFromParent();
984     return Legalized;
985   }
986   case TargetOpcode::G_SEXT:
987   case TargetOpcode::G_ZEXT:
988   case TargetOpcode::G_ANYEXT:
989     return narrowScalarExt(MI, TypeIdx, NarrowTy);
990   case TargetOpcode::G_TRUNC: {
991     if (TypeIdx != 1)
992       return UnableToLegalize;
993 
994     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
995     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
996       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
997       return UnableToLegalize;
998     }
999 
1000     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1001     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1002     MI.eraseFromParent();
1003     return Legalized;
1004   }
1005 
1006   case TargetOpcode::G_FREEZE: {
1007     if (TypeIdx != 0)
1008       return UnableToLegalize;
1009 
1010     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1011     // Should widen scalar first
1012     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1013       return UnableToLegalize;
1014 
1015     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1016     SmallVector<Register, 8> Parts;
1017     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1018       Parts.push_back(
1019           MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
1020     }
1021 
1022     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1023     MI.eraseFromParent();
1024     return Legalized;
1025   }
1026   case TargetOpcode::G_ADD:
1027   case TargetOpcode::G_SUB:
1028   case TargetOpcode::G_SADDO:
1029   case TargetOpcode::G_SSUBO:
1030   case TargetOpcode::G_SADDE:
1031   case TargetOpcode::G_SSUBE:
1032   case TargetOpcode::G_UADDO:
1033   case TargetOpcode::G_USUBO:
1034   case TargetOpcode::G_UADDE:
1035   case TargetOpcode::G_USUBE:
1036     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1037   case TargetOpcode::G_MUL:
1038   case TargetOpcode::G_UMULH:
1039     return narrowScalarMul(MI, NarrowTy);
1040   case TargetOpcode::G_EXTRACT:
1041     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1042   case TargetOpcode::G_INSERT:
1043     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1044   case TargetOpcode::G_LOAD: {
1045     auto &LoadMI = cast<GLoad>(MI);
1046     Register DstReg = LoadMI.getDstReg();
1047     LLT DstTy = MRI.getType(DstReg);
1048     if (DstTy.isVector())
1049       return UnableToLegalize;
1050 
1051     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1052       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1053       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1054       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1055       LoadMI.eraseFromParent();
1056       return Legalized;
1057     }
1058 
1059     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1060   }
1061   case TargetOpcode::G_ZEXTLOAD:
1062   case TargetOpcode::G_SEXTLOAD: {
1063     auto &LoadMI = cast<GExtLoad>(MI);
1064     Register DstReg = LoadMI.getDstReg();
1065     Register PtrReg = LoadMI.getPointerReg();
1066 
1067     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1068     auto &MMO = LoadMI.getMMO();
1069     unsigned MemSize = MMO.getSizeInBits();
1070 
1071     if (MemSize == NarrowSize) {
1072       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1073     } else if (MemSize < NarrowSize) {
1074       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1075     } else if (MemSize > NarrowSize) {
1076       // FIXME: Need to split the load.
1077       return UnableToLegalize;
1078     }
1079 
1080     if (isa<GZExtLoad>(LoadMI))
1081       MIRBuilder.buildZExt(DstReg, TmpReg);
1082     else
1083       MIRBuilder.buildSExt(DstReg, TmpReg);
1084 
1085     LoadMI.eraseFromParent();
1086     return Legalized;
1087   }
1088   case TargetOpcode::G_STORE: {
1089     auto &StoreMI = cast<GStore>(MI);
1090 
1091     Register SrcReg = StoreMI.getValueReg();
1092     LLT SrcTy = MRI.getType(SrcReg);
1093     if (SrcTy.isVector())
1094       return UnableToLegalize;
1095 
1096     int NumParts = SizeOp0 / NarrowSize;
1097     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1098     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1099     if (SrcTy.isVector() && LeftoverBits != 0)
1100       return UnableToLegalize;
1101 
1102     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1103       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1104       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1105       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1106       StoreMI.eraseFromParent();
1107       return Legalized;
1108     }
1109 
1110     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1111   }
1112   case TargetOpcode::G_SELECT:
1113     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1114   case TargetOpcode::G_AND:
1115   case TargetOpcode::G_OR:
1116   case TargetOpcode::G_XOR: {
1117     // Legalize bitwise operation:
1118     // A = BinOp<Ty> B, C
1119     // into:
1120     // B1, ..., BN = G_UNMERGE_VALUES B
1121     // C1, ..., CN = G_UNMERGE_VALUES C
1122     // A1 = BinOp<Ty/N> B1, C2
1123     // ...
1124     // AN = BinOp<Ty/N> BN, CN
1125     // A = G_MERGE_VALUES A1, ..., AN
1126     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1127   }
1128   case TargetOpcode::G_SHL:
1129   case TargetOpcode::G_LSHR:
1130   case TargetOpcode::G_ASHR:
1131     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1132   case TargetOpcode::G_CTLZ:
1133   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1134   case TargetOpcode::G_CTTZ:
1135   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1136   case TargetOpcode::G_CTPOP:
1137     if (TypeIdx == 1)
1138       switch (MI.getOpcode()) {
1139       case TargetOpcode::G_CTLZ:
1140       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1141         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1142       case TargetOpcode::G_CTTZ:
1143       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1144         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1145       case TargetOpcode::G_CTPOP:
1146         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1147       default:
1148         return UnableToLegalize;
1149       }
1150 
1151     Observer.changingInstr(MI);
1152     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1153     Observer.changedInstr(MI);
1154     return Legalized;
1155   case TargetOpcode::G_INTTOPTR:
1156     if (TypeIdx != 1)
1157       return UnableToLegalize;
1158 
1159     Observer.changingInstr(MI);
1160     narrowScalarSrc(MI, NarrowTy, 1);
1161     Observer.changedInstr(MI);
1162     return Legalized;
1163   case TargetOpcode::G_PTRTOINT:
1164     if (TypeIdx != 0)
1165       return UnableToLegalize;
1166 
1167     Observer.changingInstr(MI);
1168     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1169     Observer.changedInstr(MI);
1170     return Legalized;
1171   case TargetOpcode::G_PHI: {
1172     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1173     // NarrowSize.
1174     if (SizeOp0 % NarrowSize != 0)
1175       return UnableToLegalize;
1176 
1177     unsigned NumParts = SizeOp0 / NarrowSize;
1178     SmallVector<Register, 2> DstRegs(NumParts);
1179     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1180     Observer.changingInstr(MI);
1181     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1182       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1183       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1184       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1185                    SrcRegs[i / 2]);
1186     }
1187     MachineBasicBlock &MBB = *MI.getParent();
1188     MIRBuilder.setInsertPt(MBB, MI);
1189     for (unsigned i = 0; i < NumParts; ++i) {
1190       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1191       MachineInstrBuilder MIB =
1192           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1193       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1194         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1195     }
1196     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1197     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1198     Observer.changedInstr(MI);
1199     MI.eraseFromParent();
1200     return Legalized;
1201   }
1202   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1203   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1204     if (TypeIdx != 2)
1205       return UnableToLegalize;
1206 
1207     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1208     Observer.changingInstr(MI);
1209     narrowScalarSrc(MI, NarrowTy, OpIdx);
1210     Observer.changedInstr(MI);
1211     return Legalized;
1212   }
1213   case TargetOpcode::G_ICMP: {
1214     Register LHS = MI.getOperand(2).getReg();
1215     LLT SrcTy = MRI.getType(LHS);
1216     uint64_t SrcSize = SrcTy.getSizeInBits();
1217     CmpInst::Predicate Pred =
1218         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1219 
1220     // TODO: Handle the non-equality case for weird sizes.
1221     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1222       return UnableToLegalize;
1223 
1224     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1225     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1226     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1227                       LHSLeftoverRegs))
1228       return UnableToLegalize;
1229 
1230     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1231     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1232     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1233                       RHSPartRegs, RHSLeftoverRegs))
1234       return UnableToLegalize;
1235 
1236     // We now have the LHS and RHS of the compare split into narrow-type
1237     // registers, plus potentially some leftover type.
1238     Register Dst = MI.getOperand(0).getReg();
1239     LLT ResTy = MRI.getType(Dst);
1240     if (ICmpInst::isEquality(Pred)) {
1241       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1242       // them together. For each equal part, the result should be all 0s. For
1243       // each non-equal part, we'll get at least one 1.
1244       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1245       SmallVector<Register, 4> Xors;
1246       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1247         auto LHS = std::get<0>(LHSAndRHS);
1248         auto RHS = std::get<1>(LHSAndRHS);
1249         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1250         Xors.push_back(Xor);
1251       }
1252 
1253       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1254       // to the desired narrow type so that we can OR them together later.
1255       SmallVector<Register, 4> WidenedXors;
1256       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1257         auto LHS = std::get<0>(LHSAndRHS);
1258         auto RHS = std::get<1>(LHSAndRHS);
1259         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1260         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1261         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1262                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1263         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1264       }
1265 
1266       // Now, for each part we broke up, we know if they are equal/not equal
1267       // based off the G_XOR. We can OR these all together and compare against
1268       // 0 to get the result.
1269       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1270       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1271       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1272         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1273       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1274     } else {
1275       // TODO: Handle non-power-of-two types.
1276       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1277       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1278       Register LHSL = LHSPartRegs[0];
1279       Register LHSH = LHSPartRegs[1];
1280       Register RHSL = RHSPartRegs[0];
1281       Register RHSH = RHSPartRegs[1];
1282       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1283       MachineInstrBuilder CmpHEQ =
1284           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1285       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1286           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1287       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1288     }
1289     MI.eraseFromParent();
1290     return Legalized;
1291   }
1292   case TargetOpcode::G_SEXT_INREG: {
1293     if (TypeIdx != 0)
1294       return UnableToLegalize;
1295 
1296     int64_t SizeInBits = MI.getOperand(2).getImm();
1297 
1298     // So long as the new type has more bits than the bits we're extending we
1299     // don't need to break it apart.
1300     if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1301       Observer.changingInstr(MI);
1302       // We don't lose any non-extension bits by truncating the src and
1303       // sign-extending the dst.
1304       MachineOperand &MO1 = MI.getOperand(1);
1305       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1306       MO1.setReg(TruncMIB.getReg(0));
1307 
1308       MachineOperand &MO2 = MI.getOperand(0);
1309       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1310       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1311       MIRBuilder.buildSExt(MO2, DstExt);
1312       MO2.setReg(DstExt);
1313       Observer.changedInstr(MI);
1314       return Legalized;
1315     }
1316 
1317     // Break it apart. Components below the extension point are unmodified. The
1318     // component containing the extension point becomes a narrower SEXT_INREG.
1319     // Components above it are ashr'd from the component containing the
1320     // extension point.
1321     if (SizeOp0 % NarrowSize != 0)
1322       return UnableToLegalize;
1323     int NumParts = SizeOp0 / NarrowSize;
1324 
1325     // List the registers where the destination will be scattered.
1326     SmallVector<Register, 2> DstRegs;
1327     // List the registers where the source will be split.
1328     SmallVector<Register, 2> SrcRegs;
1329 
1330     // Create all the temporary registers.
1331     for (int i = 0; i < NumParts; ++i) {
1332       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1333 
1334       SrcRegs.push_back(SrcReg);
1335     }
1336 
1337     // Explode the big arguments into smaller chunks.
1338     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1339 
1340     Register AshrCstReg =
1341         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1342             .getReg(0);
1343     Register FullExtensionReg = 0;
1344     Register PartialExtensionReg = 0;
1345 
1346     // Do the operation on each small part.
1347     for (int i = 0; i < NumParts; ++i) {
1348       if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
1349         DstRegs.push_back(SrcRegs[i]);
1350       else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
1351         assert(PartialExtensionReg &&
1352                "Expected to visit partial extension before full");
1353         if (FullExtensionReg) {
1354           DstRegs.push_back(FullExtensionReg);
1355           continue;
1356         }
1357         DstRegs.push_back(
1358             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1359                 .getReg(0));
1360         FullExtensionReg = DstRegs.back();
1361       } else {
1362         DstRegs.push_back(
1363             MIRBuilder
1364                 .buildInstr(
1365                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1366                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1367                 .getReg(0));
1368         PartialExtensionReg = DstRegs.back();
1369       }
1370     }
1371 
1372     // Gather the destination registers into the final destination.
1373     Register DstReg = MI.getOperand(0).getReg();
1374     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1375     MI.eraseFromParent();
1376     return Legalized;
1377   }
1378   case TargetOpcode::G_BSWAP:
1379   case TargetOpcode::G_BITREVERSE: {
1380     if (SizeOp0 % NarrowSize != 0)
1381       return UnableToLegalize;
1382 
1383     Observer.changingInstr(MI);
1384     SmallVector<Register, 2> SrcRegs, DstRegs;
1385     unsigned NumParts = SizeOp0 / NarrowSize;
1386     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
1387 
1388     for (unsigned i = 0; i < NumParts; ++i) {
1389       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1390                                            {SrcRegs[NumParts - 1 - i]});
1391       DstRegs.push_back(DstPart.getReg(0));
1392     }
1393 
1394     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1395 
1396     Observer.changedInstr(MI);
1397     MI.eraseFromParent();
1398     return Legalized;
1399   }
1400   case TargetOpcode::G_PTR_ADD:
1401   case TargetOpcode::G_PTRMASK: {
1402     if (TypeIdx != 1)
1403       return UnableToLegalize;
1404     Observer.changingInstr(MI);
1405     narrowScalarSrc(MI, NarrowTy, 2);
1406     Observer.changedInstr(MI);
1407     return Legalized;
1408   }
1409   case TargetOpcode::G_FPTOUI:
1410   case TargetOpcode::G_FPTOSI:
1411     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1412   case TargetOpcode::G_FPEXT:
1413     if (TypeIdx != 0)
1414       return UnableToLegalize;
1415     Observer.changingInstr(MI);
1416     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1417     Observer.changedInstr(MI);
1418     return Legalized;
1419   case TargetOpcode::G_FLDEXP:
1420   case TargetOpcode::G_STRICT_FLDEXP:
1421     return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
1422   }
1423 }
1424 
1425 Register LegalizerHelper::coerceToScalar(Register Val) {
1426   LLT Ty = MRI.getType(Val);
1427   if (Ty.isScalar())
1428     return Val;
1429 
1430   const DataLayout &DL = MIRBuilder.getDataLayout();
1431   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1432   if (Ty.isPointer()) {
1433     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1434       return Register();
1435     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1436   }
1437 
1438   Register NewVal = Val;
1439 
1440   assert(Ty.isVector());
1441   LLT EltTy = Ty.getElementType();
1442   if (EltTy.isPointer())
1443     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1444   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1445 }
1446 
1447 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1448                                      unsigned OpIdx, unsigned ExtOpcode) {
1449   MachineOperand &MO = MI.getOperand(OpIdx);
1450   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1451   MO.setReg(ExtB.getReg(0));
1452 }
1453 
1454 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1455                                       unsigned OpIdx) {
1456   MachineOperand &MO = MI.getOperand(OpIdx);
1457   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1458   MO.setReg(ExtB.getReg(0));
1459 }
1460 
1461 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1462                                      unsigned OpIdx, unsigned TruncOpcode) {
1463   MachineOperand &MO = MI.getOperand(OpIdx);
1464   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1465   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1466   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1467   MO.setReg(DstExt);
1468 }
1469 
1470 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1471                                       unsigned OpIdx, unsigned ExtOpcode) {
1472   MachineOperand &MO = MI.getOperand(OpIdx);
1473   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1474   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1475   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1476   MO.setReg(DstTrunc);
1477 }
1478 
1479 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1480                                             unsigned OpIdx) {
1481   MachineOperand &MO = MI.getOperand(OpIdx);
1482   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1483   Register Dst = MO.getReg();
1484   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1485   MO.setReg(DstExt);
1486   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1487 }
1488 
1489 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1490                                             unsigned OpIdx) {
1491   MachineOperand &MO = MI.getOperand(OpIdx);
1492   SmallVector<Register, 8> Regs;
1493   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1494 }
1495 
1496 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1497   MachineOperand &Op = MI.getOperand(OpIdx);
1498   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1499 }
1500 
1501 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1502   MachineOperand &MO = MI.getOperand(OpIdx);
1503   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1504   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1505   MIRBuilder.buildBitcast(MO, CastDst);
1506   MO.setReg(CastDst);
1507 }
1508 
1509 LegalizerHelper::LegalizeResult
1510 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1511                                         LLT WideTy) {
1512   if (TypeIdx != 1)
1513     return UnableToLegalize;
1514 
1515   auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
1516   if (DstTy.isVector())
1517     return UnableToLegalize;
1518 
1519   LLT SrcTy = MRI.getType(Src1Reg);
1520   const int DstSize = DstTy.getSizeInBits();
1521   const int SrcSize = SrcTy.getSizeInBits();
1522   const int WideSize = WideTy.getSizeInBits();
1523   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1524 
1525   unsigned NumOps = MI.getNumOperands();
1526   unsigned NumSrc = MI.getNumOperands() - 1;
1527   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1528 
1529   if (WideSize >= DstSize) {
1530     // Directly pack the bits in the target type.
1531     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
1532 
1533     for (unsigned I = 2; I != NumOps; ++I) {
1534       const unsigned Offset = (I - 1) * PartSize;
1535 
1536       Register SrcReg = MI.getOperand(I).getReg();
1537       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1538 
1539       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1540 
1541       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1542         MRI.createGenericVirtualRegister(WideTy);
1543 
1544       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1545       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1546       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1547       ResultReg = NextResult;
1548     }
1549 
1550     if (WideSize > DstSize)
1551       MIRBuilder.buildTrunc(DstReg, ResultReg);
1552     else if (DstTy.isPointer())
1553       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1554 
1555     MI.eraseFromParent();
1556     return Legalized;
1557   }
1558 
1559   // Unmerge the original values to the GCD type, and recombine to the next
1560   // multiple greater than the original type.
1561   //
1562   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1563   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1564   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1565   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1566   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1567   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1568   // %12:_(s12) = G_MERGE_VALUES %10, %11
1569   //
1570   // Padding with undef if necessary:
1571   //
1572   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1573   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1574   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1575   // %7:_(s2) = G_IMPLICIT_DEF
1576   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1577   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1578   // %10:_(s12) = G_MERGE_VALUES %8, %9
1579 
1580   const int GCD = std::gcd(SrcSize, WideSize);
1581   LLT GCDTy = LLT::scalar(GCD);
1582 
1583   SmallVector<Register, 8> Parts;
1584   SmallVector<Register, 8> NewMergeRegs;
1585   SmallVector<Register, 8> Unmerges;
1586   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1587 
1588   // Decompose the original operands if they don't evenly divide.
1589   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1590     Register SrcReg = MO.getReg();
1591     if (GCD == SrcSize) {
1592       Unmerges.push_back(SrcReg);
1593     } else {
1594       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1595       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1596         Unmerges.push_back(Unmerge.getReg(J));
1597     }
1598   }
1599 
1600   // Pad with undef to the next size that is a multiple of the requested size.
1601   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1602     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1603     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1604       Unmerges.push_back(UndefReg);
1605   }
1606 
1607   const int PartsPerGCD = WideSize / GCD;
1608 
1609   // Build merges of each piece.
1610   ArrayRef<Register> Slicer(Unmerges);
1611   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1612     auto Merge =
1613         MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
1614     NewMergeRegs.push_back(Merge.getReg(0));
1615   }
1616 
1617   // A truncate may be necessary if the requested type doesn't evenly divide the
1618   // original result type.
1619   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1620     MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
1621   } else {
1622     auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
1623     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1624   }
1625 
1626   MI.eraseFromParent();
1627   return Legalized;
1628 }
1629 
1630 LegalizerHelper::LegalizeResult
1631 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1632                                           LLT WideTy) {
1633   if (TypeIdx != 0)
1634     return UnableToLegalize;
1635 
1636   int NumDst = MI.getNumOperands() - 1;
1637   Register SrcReg = MI.getOperand(NumDst).getReg();
1638   LLT SrcTy = MRI.getType(SrcReg);
1639   if (SrcTy.isVector())
1640     return UnableToLegalize;
1641 
1642   Register Dst0Reg = MI.getOperand(0).getReg();
1643   LLT DstTy = MRI.getType(Dst0Reg);
1644   if (!DstTy.isScalar())
1645     return UnableToLegalize;
1646 
1647   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1648     if (SrcTy.isPointer()) {
1649       const DataLayout &DL = MIRBuilder.getDataLayout();
1650       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1651         LLVM_DEBUG(
1652             dbgs() << "Not casting non-integral address space integer\n");
1653         return UnableToLegalize;
1654       }
1655 
1656       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1657       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1658     }
1659 
1660     // Widen SrcTy to WideTy. This does not affect the result, but since the
1661     // user requested this size, it is probably better handled than SrcTy and
1662     // should reduce the total number of legalization artifacts.
1663     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1664       SrcTy = WideTy;
1665       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1666     }
1667 
1668     // Theres no unmerge type to target. Directly extract the bits from the
1669     // source type
1670     unsigned DstSize = DstTy.getSizeInBits();
1671 
1672     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1673     for (int I = 1; I != NumDst; ++I) {
1674       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1675       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1676       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1677     }
1678 
1679     MI.eraseFromParent();
1680     return Legalized;
1681   }
1682 
1683   // Extend the source to a wider type.
1684   LLT LCMTy = getLCMType(SrcTy, WideTy);
1685 
1686   Register WideSrc = SrcReg;
1687   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1688     // TODO: If this is an integral address space, cast to integer and anyext.
1689     if (SrcTy.isPointer()) {
1690       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1691       return UnableToLegalize;
1692     }
1693 
1694     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1695   }
1696 
1697   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1698 
1699   // Create a sequence of unmerges and merges to the original results. Since we
1700   // may have widened the source, we will need to pad the results with dead defs
1701   // to cover the source register.
1702   // e.g. widen s48 to s64:
1703   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1704   //
1705   // =>
1706   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1707   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1708   //  ; unpack to GCD type, with extra dead defs
1709   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1710   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1711   //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
1712   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1713   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1714   const LLT GCDTy = getGCDType(WideTy, DstTy);
1715   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1716   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1717 
1718   // Directly unmerge to the destination without going through a GCD type
1719   // if possible
1720   if (PartsPerRemerge == 1) {
1721     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1722 
1723     for (int I = 0; I != NumUnmerge; ++I) {
1724       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1725 
1726       for (int J = 0; J != PartsPerUnmerge; ++J) {
1727         int Idx = I * PartsPerUnmerge + J;
1728         if (Idx < NumDst)
1729           MIB.addDef(MI.getOperand(Idx).getReg());
1730         else {
1731           // Create dead def for excess components.
1732           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
1733         }
1734       }
1735 
1736       MIB.addUse(Unmerge.getReg(I));
1737     }
1738   } else {
1739     SmallVector<Register, 16> Parts;
1740     for (int J = 0; J != NumUnmerge; ++J)
1741       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
1742 
1743     SmallVector<Register, 8> RemergeParts;
1744     for (int I = 0; I != NumDst; ++I) {
1745       for (int J = 0; J < PartsPerRemerge; ++J) {
1746         const int Idx = I * PartsPerRemerge + J;
1747         RemergeParts.emplace_back(Parts[Idx]);
1748       }
1749 
1750       MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
1751       RemergeParts.clear();
1752     }
1753   }
1754 
1755   MI.eraseFromParent();
1756   return Legalized;
1757 }
1758 
1759 LegalizerHelper::LegalizeResult
1760 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
1761                                     LLT WideTy) {
1762   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
1763   unsigned Offset = MI.getOperand(2).getImm();
1764 
1765   if (TypeIdx == 0) {
1766     if (SrcTy.isVector() || DstTy.isVector())
1767       return UnableToLegalize;
1768 
1769     SrcOp Src(SrcReg);
1770     if (SrcTy.isPointer()) {
1771       // Extracts from pointers can be handled only if they are really just
1772       // simple integers.
1773       const DataLayout &DL = MIRBuilder.getDataLayout();
1774       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
1775         return UnableToLegalize;
1776 
1777       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
1778       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
1779       SrcTy = SrcAsIntTy;
1780     }
1781 
1782     if (DstTy.isPointer())
1783       return UnableToLegalize;
1784 
1785     if (Offset == 0) {
1786       // Avoid a shift in the degenerate case.
1787       MIRBuilder.buildTrunc(DstReg,
1788                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
1789       MI.eraseFromParent();
1790       return Legalized;
1791     }
1792 
1793     // Do a shift in the source type.
1794     LLT ShiftTy = SrcTy;
1795     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1796       Src = MIRBuilder.buildAnyExt(WideTy, Src);
1797       ShiftTy = WideTy;
1798     }
1799 
1800     auto LShr = MIRBuilder.buildLShr(
1801       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
1802     MIRBuilder.buildTrunc(DstReg, LShr);
1803     MI.eraseFromParent();
1804     return Legalized;
1805   }
1806 
1807   if (SrcTy.isScalar()) {
1808     Observer.changingInstr(MI);
1809     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1810     Observer.changedInstr(MI);
1811     return Legalized;
1812   }
1813 
1814   if (!SrcTy.isVector())
1815     return UnableToLegalize;
1816 
1817   if (DstTy != SrcTy.getElementType())
1818     return UnableToLegalize;
1819 
1820   if (Offset % SrcTy.getScalarSizeInBits() != 0)
1821     return UnableToLegalize;
1822 
1823   Observer.changingInstr(MI);
1824   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1825 
1826   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
1827                           Offset);
1828   widenScalarDst(MI, WideTy.getScalarType(), 0);
1829   Observer.changedInstr(MI);
1830   return Legalized;
1831 }
1832 
1833 LegalizerHelper::LegalizeResult
1834 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
1835                                    LLT WideTy) {
1836   if (TypeIdx != 0 || WideTy.isVector())
1837     return UnableToLegalize;
1838   Observer.changingInstr(MI);
1839   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
1840   widenScalarDst(MI, WideTy);
1841   Observer.changedInstr(MI);
1842   return Legalized;
1843 }
1844 
1845 LegalizerHelper::LegalizeResult
1846 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
1847                                            LLT WideTy) {
1848   unsigned Opcode;
1849   unsigned ExtOpcode;
1850   std::optional<Register> CarryIn;
1851   switch (MI.getOpcode()) {
1852   default:
1853     llvm_unreachable("Unexpected opcode!");
1854   case TargetOpcode::G_SADDO:
1855     Opcode = TargetOpcode::G_ADD;
1856     ExtOpcode = TargetOpcode::G_SEXT;
1857     break;
1858   case TargetOpcode::G_SSUBO:
1859     Opcode = TargetOpcode::G_SUB;
1860     ExtOpcode = TargetOpcode::G_SEXT;
1861     break;
1862   case TargetOpcode::G_UADDO:
1863     Opcode = TargetOpcode::G_ADD;
1864     ExtOpcode = TargetOpcode::G_ZEXT;
1865     break;
1866   case TargetOpcode::G_USUBO:
1867     Opcode = TargetOpcode::G_SUB;
1868     ExtOpcode = TargetOpcode::G_ZEXT;
1869     break;
1870   case TargetOpcode::G_SADDE:
1871     Opcode = TargetOpcode::G_UADDE;
1872     ExtOpcode = TargetOpcode::G_SEXT;
1873     CarryIn = MI.getOperand(4).getReg();
1874     break;
1875   case TargetOpcode::G_SSUBE:
1876     Opcode = TargetOpcode::G_USUBE;
1877     ExtOpcode = TargetOpcode::G_SEXT;
1878     CarryIn = MI.getOperand(4).getReg();
1879     break;
1880   case TargetOpcode::G_UADDE:
1881     Opcode = TargetOpcode::G_UADDE;
1882     ExtOpcode = TargetOpcode::G_ZEXT;
1883     CarryIn = MI.getOperand(4).getReg();
1884     break;
1885   case TargetOpcode::G_USUBE:
1886     Opcode = TargetOpcode::G_USUBE;
1887     ExtOpcode = TargetOpcode::G_ZEXT;
1888     CarryIn = MI.getOperand(4).getReg();
1889     break;
1890   }
1891 
1892   if (TypeIdx == 1) {
1893     unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);
1894 
1895     Observer.changingInstr(MI);
1896     if (CarryIn)
1897       widenScalarSrc(MI, WideTy, 4, BoolExtOp);
1898     widenScalarDst(MI, WideTy, 1);
1899 
1900     Observer.changedInstr(MI);
1901     return Legalized;
1902   }
1903 
1904   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
1905   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
1906   // Do the arithmetic in the larger type.
1907   Register NewOp;
1908   if (CarryIn) {
1909     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
1910     NewOp = MIRBuilder
1911                 .buildInstr(Opcode, {WideTy, CarryOutTy},
1912                             {LHSExt, RHSExt, *CarryIn})
1913                 .getReg(0);
1914   } else {
1915     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
1916   }
1917   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
1918   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
1919   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
1920   // There is no overflow if the ExtOp is the same as NewOp.
1921   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
1922   // Now trunc the NewOp to the original result.
1923   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
1924   MI.eraseFromParent();
1925   return Legalized;
1926 }
1927 
1928 LegalizerHelper::LegalizeResult
1929 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
1930                                          LLT WideTy) {
1931   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
1932                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
1933                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
1934   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
1935                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
1936   // We can convert this to:
1937   //   1. Any extend iN to iM
1938   //   2. SHL by M-N
1939   //   3. [US][ADD|SUB|SHL]SAT
1940   //   4. L/ASHR by M-N
1941   //
1942   // It may be more efficient to lower this to a min and a max operation in
1943   // the higher precision arithmetic if the promoted operation isn't legal,
1944   // but this decision is up to the target's lowering request.
1945   Register DstReg = MI.getOperand(0).getReg();
1946 
1947   unsigned NewBits = WideTy.getScalarSizeInBits();
1948   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
1949 
1950   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
1951   // must not left shift the RHS to preserve the shift amount.
1952   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
1953   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
1954                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
1955   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
1956   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
1957   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
1958 
1959   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
1960                                         {ShiftL, ShiftR}, MI.getFlags());
1961 
1962   // Use a shift that will preserve the number of sign bits when the trunc is
1963   // folded away.
1964   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
1965                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
1966 
1967   MIRBuilder.buildTrunc(DstReg, Result);
1968   MI.eraseFromParent();
1969   return Legalized;
1970 }
1971 
1972 LegalizerHelper::LegalizeResult
1973 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
1974                                  LLT WideTy) {
1975   if (TypeIdx == 1) {
1976     Observer.changingInstr(MI);
1977     widenScalarDst(MI, WideTy, 1);
1978     Observer.changedInstr(MI);
1979     return Legalized;
1980   }
1981 
1982   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
1983   auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
1984   LLT SrcTy = MRI.getType(LHS);
1985   LLT OverflowTy = MRI.getType(OriginalOverflow);
1986   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
1987 
1988   // To determine if the result overflowed in the larger type, we extend the
1989   // input to the larger type, do the multiply (checking if it overflows),
1990   // then also check the high bits of the result to see if overflow happened
1991   // there.
1992   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
1993   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
1994   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
1995 
1996   auto Mulo = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy, OverflowTy},
1997                                     {LeftOperand, RightOperand});
1998   auto Mul = Mulo->getOperand(0);
1999   MIRBuilder.buildTrunc(Result, Mul);
2000 
2001   MachineInstrBuilder ExtResult;
2002   // Overflow occurred if it occurred in the larger type, or if the high part
2003   // of the result does not zero/sign-extend the low part.  Check this second
2004   // possibility first.
2005   if (IsSigned) {
2006     // For signed, overflow occurred when the high part does not sign-extend
2007     // the low part.
2008     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2009   } else {
2010     // Unsigned overflow occurred when the high part does not zero-extend the
2011     // low part.
2012     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2013   }
2014 
2015   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2016   // so we don't need to check the overflow result of larger type Mulo.
2017   if (WideTy.getScalarSizeInBits() < 2 * SrcBitWidth) {
2018     auto Overflow =
2019         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2020     // Finally check if the multiplication in the larger type itself overflowed.
2021     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2022   } else {
2023     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2024   }
2025   MI.eraseFromParent();
2026   return Legalized;
2027 }
2028 
2029 LegalizerHelper::LegalizeResult
2030 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2031   switch (MI.getOpcode()) {
2032   default:
2033     return UnableToLegalize;
2034   case TargetOpcode::G_ATOMICRMW_XCHG:
2035   case TargetOpcode::G_ATOMICRMW_ADD:
2036   case TargetOpcode::G_ATOMICRMW_SUB:
2037   case TargetOpcode::G_ATOMICRMW_AND:
2038   case TargetOpcode::G_ATOMICRMW_OR:
2039   case TargetOpcode::G_ATOMICRMW_XOR:
2040   case TargetOpcode::G_ATOMICRMW_MIN:
2041   case TargetOpcode::G_ATOMICRMW_MAX:
2042   case TargetOpcode::G_ATOMICRMW_UMIN:
2043   case TargetOpcode::G_ATOMICRMW_UMAX:
2044     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2045     Observer.changingInstr(MI);
2046     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2047     widenScalarDst(MI, WideTy, 0);
2048     Observer.changedInstr(MI);
2049     return Legalized;
2050   case TargetOpcode::G_ATOMIC_CMPXCHG:
2051     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2052     Observer.changingInstr(MI);
2053     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2054     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2055     widenScalarDst(MI, WideTy, 0);
2056     Observer.changedInstr(MI);
2057     return Legalized;
2058   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2059     if (TypeIdx == 0) {
2060       Observer.changingInstr(MI);
2061       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2062       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2063       widenScalarDst(MI, WideTy, 0);
2064       Observer.changedInstr(MI);
2065       return Legalized;
2066     }
2067     assert(TypeIdx == 1 &&
2068            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2069     Observer.changingInstr(MI);
2070     widenScalarDst(MI, WideTy, 1);
2071     Observer.changedInstr(MI);
2072     return Legalized;
2073   case TargetOpcode::G_EXTRACT:
2074     return widenScalarExtract(MI, TypeIdx, WideTy);
2075   case TargetOpcode::G_INSERT:
2076     return widenScalarInsert(MI, TypeIdx, WideTy);
2077   case TargetOpcode::G_MERGE_VALUES:
2078     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2079   case TargetOpcode::G_UNMERGE_VALUES:
2080     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2081   case TargetOpcode::G_SADDO:
2082   case TargetOpcode::G_SSUBO:
2083   case TargetOpcode::G_UADDO:
2084   case TargetOpcode::G_USUBO:
2085   case TargetOpcode::G_SADDE:
2086   case TargetOpcode::G_SSUBE:
2087   case TargetOpcode::G_UADDE:
2088   case TargetOpcode::G_USUBE:
2089     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2090   case TargetOpcode::G_UMULO:
2091   case TargetOpcode::G_SMULO:
2092     return widenScalarMulo(MI, TypeIdx, WideTy);
2093   case TargetOpcode::G_SADDSAT:
2094   case TargetOpcode::G_SSUBSAT:
2095   case TargetOpcode::G_SSHLSAT:
2096   case TargetOpcode::G_UADDSAT:
2097   case TargetOpcode::G_USUBSAT:
2098   case TargetOpcode::G_USHLSAT:
2099     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2100   case TargetOpcode::G_CTTZ:
2101   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2102   case TargetOpcode::G_CTLZ:
2103   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2104   case TargetOpcode::G_CTPOP: {
2105     if (TypeIdx == 0) {
2106       Observer.changingInstr(MI);
2107       widenScalarDst(MI, WideTy, 0);
2108       Observer.changedInstr(MI);
2109       return Legalized;
2110     }
2111 
2112     Register SrcReg = MI.getOperand(1).getReg();
2113 
2114     // First extend the input.
2115     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2116                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2117                           ? TargetOpcode::G_ANYEXT
2118                           : TargetOpcode::G_ZEXT;
2119     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2120     LLT CurTy = MRI.getType(SrcReg);
2121     unsigned NewOpc = MI.getOpcode();
2122     if (NewOpc == TargetOpcode::G_CTTZ) {
2123       // The count is the same in the larger type except if the original
2124       // value was zero.  This can be handled by setting the bit just off
2125       // the top of the original type.
2126       auto TopBit =
2127           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2128       MIBSrc = MIRBuilder.buildOr(
2129         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2130       // Now we know the operand is non-zero, use the more relaxed opcode.
2131       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2132     }
2133 
2134     // Perform the operation at the larger size.
2135     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2136     // This is already the correct result for CTPOP and CTTZs
2137     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2138         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2139       // The correct result is NewOp - (Difference in widety and current ty).
2140       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2141       MIBNewOp = MIRBuilder.buildSub(
2142           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2143     }
2144 
2145     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2146     MI.eraseFromParent();
2147     return Legalized;
2148   }
2149   case TargetOpcode::G_BSWAP: {
2150     Observer.changingInstr(MI);
2151     Register DstReg = MI.getOperand(0).getReg();
2152 
2153     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2154     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2155     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2156     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2157 
2158     MI.getOperand(0).setReg(DstExt);
2159 
2160     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2161 
2162     LLT Ty = MRI.getType(DstReg);
2163     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2164     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2165     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2166 
2167     MIRBuilder.buildTrunc(DstReg, ShrReg);
2168     Observer.changedInstr(MI);
2169     return Legalized;
2170   }
2171   case TargetOpcode::G_BITREVERSE: {
2172     Observer.changingInstr(MI);
2173 
2174     Register DstReg = MI.getOperand(0).getReg();
2175     LLT Ty = MRI.getType(DstReg);
2176     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2177 
2178     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2179     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2180     MI.getOperand(0).setReg(DstExt);
2181     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2182 
2183     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2184     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2185     MIRBuilder.buildTrunc(DstReg, Shift);
2186     Observer.changedInstr(MI);
2187     return Legalized;
2188   }
2189   case TargetOpcode::G_FREEZE:
2190     Observer.changingInstr(MI);
2191     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2192     widenScalarDst(MI, WideTy);
2193     Observer.changedInstr(MI);
2194     return Legalized;
2195 
2196   case TargetOpcode::G_ABS:
2197     Observer.changingInstr(MI);
2198     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2199     widenScalarDst(MI, WideTy);
2200     Observer.changedInstr(MI);
2201     return Legalized;
2202 
2203   case TargetOpcode::G_ADD:
2204   case TargetOpcode::G_AND:
2205   case TargetOpcode::G_MUL:
2206   case TargetOpcode::G_OR:
2207   case TargetOpcode::G_XOR:
2208   case TargetOpcode::G_SUB:
2209     // Perform operation at larger width (any extension is fines here, high bits
2210     // don't affect the result) and then truncate the result back to the
2211     // original type.
2212     Observer.changingInstr(MI);
2213     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2214     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2215     widenScalarDst(MI, WideTy);
2216     Observer.changedInstr(MI);
2217     return Legalized;
2218 
2219   case TargetOpcode::G_SBFX:
2220   case TargetOpcode::G_UBFX:
2221     Observer.changingInstr(MI);
2222 
2223     if (TypeIdx == 0) {
2224       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2225       widenScalarDst(MI, WideTy);
2226     } else {
2227       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2228       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2229     }
2230 
2231     Observer.changedInstr(MI);
2232     return Legalized;
2233 
2234   case TargetOpcode::G_SHL:
2235     Observer.changingInstr(MI);
2236 
2237     if (TypeIdx == 0) {
2238       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2239       widenScalarDst(MI, WideTy);
2240     } else {
2241       assert(TypeIdx == 1);
2242       // The "number of bits to shift" operand must preserve its value as an
2243       // unsigned integer:
2244       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2245     }
2246 
2247     Observer.changedInstr(MI);
2248     return Legalized;
2249 
2250   case TargetOpcode::G_SDIV:
2251   case TargetOpcode::G_SREM:
2252   case TargetOpcode::G_SMIN:
2253   case TargetOpcode::G_SMAX:
2254     Observer.changingInstr(MI);
2255     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2256     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2257     widenScalarDst(MI, WideTy);
2258     Observer.changedInstr(MI);
2259     return Legalized;
2260 
2261   case TargetOpcode::G_SDIVREM:
2262     Observer.changingInstr(MI);
2263     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2264     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2265     widenScalarDst(MI, WideTy);
2266     widenScalarDst(MI, WideTy, 1);
2267     Observer.changedInstr(MI);
2268     return Legalized;
2269 
2270   case TargetOpcode::G_ASHR:
2271   case TargetOpcode::G_LSHR:
2272     Observer.changingInstr(MI);
2273 
2274     if (TypeIdx == 0) {
2275       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2276         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2277 
2278       widenScalarSrc(MI, WideTy, 1, CvtOp);
2279       widenScalarDst(MI, WideTy);
2280     } else {
2281       assert(TypeIdx == 1);
2282       // The "number of bits to shift" operand must preserve its value as an
2283       // unsigned integer:
2284       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2285     }
2286 
2287     Observer.changedInstr(MI);
2288     return Legalized;
2289   case TargetOpcode::G_UDIV:
2290   case TargetOpcode::G_UREM:
2291   case TargetOpcode::G_UMIN:
2292   case TargetOpcode::G_UMAX:
2293     Observer.changingInstr(MI);
2294     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2295     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2296     widenScalarDst(MI, WideTy);
2297     Observer.changedInstr(MI);
2298     return Legalized;
2299 
2300   case TargetOpcode::G_UDIVREM:
2301     Observer.changingInstr(MI);
2302     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2303     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2304     widenScalarDst(MI, WideTy);
2305     widenScalarDst(MI, WideTy, 1);
2306     Observer.changedInstr(MI);
2307     return Legalized;
2308 
2309   case TargetOpcode::G_SELECT:
2310     Observer.changingInstr(MI);
2311     if (TypeIdx == 0) {
2312       // Perform operation at larger width (any extension is fine here, high
2313       // bits don't affect the result) and then truncate the result back to the
2314       // original type.
2315       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2316       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2317       widenScalarDst(MI, WideTy);
2318     } else {
2319       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2320       // Explicit extension is required here since high bits affect the result.
2321       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2322     }
2323     Observer.changedInstr(MI);
2324     return Legalized;
2325 
2326   case TargetOpcode::G_FPTOSI:
2327   case TargetOpcode::G_FPTOUI:
2328     Observer.changingInstr(MI);
2329 
2330     if (TypeIdx == 0)
2331       widenScalarDst(MI, WideTy);
2332     else
2333       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2334 
2335     Observer.changedInstr(MI);
2336     return Legalized;
2337   case TargetOpcode::G_SITOFP:
2338     Observer.changingInstr(MI);
2339 
2340     if (TypeIdx == 0)
2341       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2342     else
2343       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2344 
2345     Observer.changedInstr(MI);
2346     return Legalized;
2347   case TargetOpcode::G_UITOFP:
2348     Observer.changingInstr(MI);
2349 
2350     if (TypeIdx == 0)
2351       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2352     else
2353       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2354 
2355     Observer.changedInstr(MI);
2356     return Legalized;
2357   case TargetOpcode::G_LOAD:
2358   case TargetOpcode::G_SEXTLOAD:
2359   case TargetOpcode::G_ZEXTLOAD:
2360     Observer.changingInstr(MI);
2361     widenScalarDst(MI, WideTy);
2362     Observer.changedInstr(MI);
2363     return Legalized;
2364 
2365   case TargetOpcode::G_STORE: {
2366     if (TypeIdx != 0)
2367       return UnableToLegalize;
2368 
2369     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2370     if (!Ty.isScalar())
2371       return UnableToLegalize;
2372 
2373     Observer.changingInstr(MI);
2374 
2375     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2376       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2377     widenScalarSrc(MI, WideTy, 0, ExtType);
2378 
2379     Observer.changedInstr(MI);
2380     return Legalized;
2381   }
2382   case TargetOpcode::G_CONSTANT: {
2383     MachineOperand &SrcMO = MI.getOperand(1);
2384     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2385     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2386         MRI.getType(MI.getOperand(0).getReg()));
2387     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2388             ExtOpc == TargetOpcode::G_ANYEXT) &&
2389            "Illegal Extend");
2390     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2391     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2392                            ? SrcVal.sext(WideTy.getSizeInBits())
2393                            : SrcVal.zext(WideTy.getSizeInBits());
2394     Observer.changingInstr(MI);
2395     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2396 
2397     widenScalarDst(MI, WideTy);
2398     Observer.changedInstr(MI);
2399     return Legalized;
2400   }
2401   case TargetOpcode::G_FCONSTANT: {
2402     // To avoid changing the bits of the constant due to extension to a larger
2403     // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
2404     MachineOperand &SrcMO = MI.getOperand(1);
2405     APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
2406     MIRBuilder.setInstrAndDebugLoc(MI);
2407     auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
2408     widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
2409     MI.eraseFromParent();
2410     return Legalized;
2411   }
2412   case TargetOpcode::G_IMPLICIT_DEF: {
2413     Observer.changingInstr(MI);
2414     widenScalarDst(MI, WideTy);
2415     Observer.changedInstr(MI);
2416     return Legalized;
2417   }
2418   case TargetOpcode::G_BRCOND:
2419     Observer.changingInstr(MI);
2420     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2421     Observer.changedInstr(MI);
2422     return Legalized;
2423 
2424   case TargetOpcode::G_FCMP:
2425     Observer.changingInstr(MI);
2426     if (TypeIdx == 0)
2427       widenScalarDst(MI, WideTy);
2428     else {
2429       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2430       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2431     }
2432     Observer.changedInstr(MI);
2433     return Legalized;
2434 
2435   case TargetOpcode::G_ICMP:
2436     Observer.changingInstr(MI);
2437     if (TypeIdx == 0)
2438       widenScalarDst(MI, WideTy);
2439     else {
2440       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2441                                MI.getOperand(1).getPredicate()))
2442                                ? TargetOpcode::G_SEXT
2443                                : TargetOpcode::G_ZEXT;
2444       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2445       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2446     }
2447     Observer.changedInstr(MI);
2448     return Legalized;
2449 
2450   case TargetOpcode::G_PTR_ADD:
2451     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2452     Observer.changingInstr(MI);
2453     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2454     Observer.changedInstr(MI);
2455     return Legalized;
2456 
2457   case TargetOpcode::G_PHI: {
2458     assert(TypeIdx == 0 && "Expecting only Idx 0");
2459 
2460     Observer.changingInstr(MI);
2461     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2462       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2463       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
2464       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2465     }
2466 
2467     MachineBasicBlock &MBB = *MI.getParent();
2468     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2469     widenScalarDst(MI, WideTy);
2470     Observer.changedInstr(MI);
2471     return Legalized;
2472   }
2473   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2474     if (TypeIdx == 0) {
2475       Register VecReg = MI.getOperand(1).getReg();
2476       LLT VecTy = MRI.getType(VecReg);
2477       Observer.changingInstr(MI);
2478 
2479       widenScalarSrc(
2480           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2481           TargetOpcode::G_ANYEXT);
2482 
2483       widenScalarDst(MI, WideTy, 0);
2484       Observer.changedInstr(MI);
2485       return Legalized;
2486     }
2487 
2488     if (TypeIdx != 2)
2489       return UnableToLegalize;
2490     Observer.changingInstr(MI);
2491     // TODO: Probably should be zext
2492     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2493     Observer.changedInstr(MI);
2494     return Legalized;
2495   }
2496   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2497     if (TypeIdx == 1) {
2498       Observer.changingInstr(MI);
2499 
2500       Register VecReg = MI.getOperand(1).getReg();
2501       LLT VecTy = MRI.getType(VecReg);
2502       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2503 
2504       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2505       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2506       widenScalarDst(MI, WideVecTy, 0);
2507       Observer.changedInstr(MI);
2508       return Legalized;
2509     }
2510 
2511     if (TypeIdx == 2) {
2512       Observer.changingInstr(MI);
2513       // TODO: Probably should be zext
2514       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2515       Observer.changedInstr(MI);
2516       return Legalized;
2517     }
2518 
2519     return UnableToLegalize;
2520   }
2521   case TargetOpcode::G_FADD:
2522   case TargetOpcode::G_FMUL:
2523   case TargetOpcode::G_FSUB:
2524   case TargetOpcode::G_FMA:
2525   case TargetOpcode::G_FMAD:
2526   case TargetOpcode::G_FNEG:
2527   case TargetOpcode::G_FABS:
2528   case TargetOpcode::G_FCANONICALIZE:
2529   case TargetOpcode::G_FMINNUM:
2530   case TargetOpcode::G_FMAXNUM:
2531   case TargetOpcode::G_FMINNUM_IEEE:
2532   case TargetOpcode::G_FMAXNUM_IEEE:
2533   case TargetOpcode::G_FMINIMUM:
2534   case TargetOpcode::G_FMAXIMUM:
2535   case TargetOpcode::G_FDIV:
2536   case TargetOpcode::G_FREM:
2537   case TargetOpcode::G_FCEIL:
2538   case TargetOpcode::G_FFLOOR:
2539   case TargetOpcode::G_FCOS:
2540   case TargetOpcode::G_FSIN:
2541   case TargetOpcode::G_FLOG10:
2542   case TargetOpcode::G_FLOG:
2543   case TargetOpcode::G_FLOG2:
2544   case TargetOpcode::G_FRINT:
2545   case TargetOpcode::G_FNEARBYINT:
2546   case TargetOpcode::G_FSQRT:
2547   case TargetOpcode::G_FEXP:
2548   case TargetOpcode::G_FEXP2:
2549   case TargetOpcode::G_FPOW:
2550   case TargetOpcode::G_INTRINSIC_TRUNC:
2551   case TargetOpcode::G_INTRINSIC_ROUND:
2552   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2553     assert(TypeIdx == 0);
2554     Observer.changingInstr(MI);
2555 
2556     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2557       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2558 
2559     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2560     Observer.changedInstr(MI);
2561     return Legalized;
2562   case TargetOpcode::G_FPOWI:
2563   case TargetOpcode::G_FLDEXP:
2564   case TargetOpcode::G_STRICT_FLDEXP: {
2565     if (TypeIdx == 0) {
2566       if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
2567         return UnableToLegalize;
2568 
2569       Observer.changingInstr(MI);
2570       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2571       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2572       Observer.changedInstr(MI);
2573       return Legalized;
2574     }
2575 
2576     if (TypeIdx == 1) {
2577       // For some reason SelectionDAG tries to promote to a libcall without
2578       // actually changing the integer type for promotion.
2579       Observer.changingInstr(MI);
2580       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2581       Observer.changedInstr(MI);
2582       return Legalized;
2583     }
2584 
2585     return UnableToLegalize;
2586   }
2587   case TargetOpcode::G_FFREXP: {
2588     Observer.changingInstr(MI);
2589 
2590     if (TypeIdx == 0) {
2591       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2592       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2593     } else {
2594       widenScalarDst(MI, WideTy, 1);
2595     }
2596 
2597     Observer.changedInstr(MI);
2598     return Legalized;
2599   }
2600   case TargetOpcode::G_INTTOPTR:
2601     if (TypeIdx != 1)
2602       return UnableToLegalize;
2603 
2604     Observer.changingInstr(MI);
2605     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2606     Observer.changedInstr(MI);
2607     return Legalized;
2608   case TargetOpcode::G_PTRTOINT:
2609     if (TypeIdx != 0)
2610       return UnableToLegalize;
2611 
2612     Observer.changingInstr(MI);
2613     widenScalarDst(MI, WideTy, 0);
2614     Observer.changedInstr(MI);
2615     return Legalized;
2616   case TargetOpcode::G_BUILD_VECTOR: {
2617     Observer.changingInstr(MI);
2618 
2619     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2620     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2621       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2622 
2623     // Avoid changing the result vector type if the source element type was
2624     // requested.
2625     if (TypeIdx == 1) {
2626       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2627     } else {
2628       widenScalarDst(MI, WideTy, 0);
2629     }
2630 
2631     Observer.changedInstr(MI);
2632     return Legalized;
2633   }
2634   case TargetOpcode::G_SEXT_INREG:
2635     if (TypeIdx != 0)
2636       return UnableToLegalize;
2637 
2638     Observer.changingInstr(MI);
2639     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2640     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2641     Observer.changedInstr(MI);
2642     return Legalized;
2643   case TargetOpcode::G_PTRMASK: {
2644     if (TypeIdx != 1)
2645       return UnableToLegalize;
2646     Observer.changingInstr(MI);
2647     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2648     Observer.changedInstr(MI);
2649     return Legalized;
2650   }
2651   }
2652 }
2653 
2654 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2655                              MachineIRBuilder &B, Register Src, LLT Ty) {
2656   auto Unmerge = B.buildUnmerge(Ty, Src);
2657   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2658     Pieces.push_back(Unmerge.getReg(I));
2659 }
2660 
2661 LegalizerHelper::LegalizeResult
2662 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
2663   Register Dst = MI.getOperand(0).getReg();
2664 
2665   MachineFunction &MF = MIRBuilder.getMF();
2666   const DataLayout &DL = MIRBuilder.getDataLayout();
2667 
2668   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
2669   LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
2670   Align Alignment = Align(DL.getABITypeAlign(
2671       getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst))));
2672 
2673   auto Addr = MIRBuilder.buildConstantPool(
2674       AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex(
2675                      MI.getOperand(1).getFPImm(), Alignment));
2676 
2677   MachineMemOperand *MMO = MF.getMachineMemOperand(
2678       MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
2679       MRI.getType(Dst), Alignment);
2680 
2681   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO);
2682   MI.eraseFromParent();
2683 
2684   return Legalized;
2685 }
2686 
2687 LegalizerHelper::LegalizeResult
2688 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
2689   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
2690   if (SrcTy.isVector()) {
2691     LLT SrcEltTy = SrcTy.getElementType();
2692     SmallVector<Register, 8> SrcRegs;
2693 
2694     if (DstTy.isVector()) {
2695       int NumDstElt = DstTy.getNumElements();
2696       int NumSrcElt = SrcTy.getNumElements();
2697 
2698       LLT DstEltTy = DstTy.getElementType();
2699       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
2700       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
2701 
2702       // If there's an element size mismatch, insert intermediate casts to match
2703       // the result element type.
2704       if (NumSrcElt < NumDstElt) { // Source element type is larger.
2705         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
2706         //
2707         // =>
2708         //
2709         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
2710         // %3:_(<2 x s8>) = G_BITCAST %2
2711         // %4:_(<2 x s8>) = G_BITCAST %3
2712         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
2713         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
2714         SrcPartTy = SrcEltTy;
2715       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
2716         //
2717         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
2718         //
2719         // =>
2720         //
2721         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
2722         // %3:_(s16) = G_BITCAST %2
2723         // %4:_(s16) = G_BITCAST %3
2724         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
2725         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
2726         DstCastTy = DstEltTy;
2727       }
2728 
2729       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
2730       for (Register &SrcReg : SrcRegs)
2731         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
2732     } else
2733       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
2734 
2735     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
2736     MI.eraseFromParent();
2737     return Legalized;
2738   }
2739 
2740   if (DstTy.isVector()) {
2741     SmallVector<Register, 8> SrcRegs;
2742     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
2743     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
2744     MI.eraseFromParent();
2745     return Legalized;
2746   }
2747 
2748   return UnableToLegalize;
2749 }
2750 
2751 /// Figure out the bit offset into a register when coercing a vector index for
2752 /// the wide element type. This is only for the case when promoting vector to
2753 /// one with larger elements.
2754 //
2755 ///
2756 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2757 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2758 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
2759                                                    Register Idx,
2760                                                    unsigned NewEltSize,
2761                                                    unsigned OldEltSize) {
2762   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2763   LLT IdxTy = B.getMRI()->getType(Idx);
2764 
2765   // Now figure out the amount we need to shift to get the target bits.
2766   auto OffsetMask = B.buildConstant(
2767       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
2768   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
2769   return B.buildShl(IdxTy, OffsetIdx,
2770                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
2771 }
2772 
2773 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
2774 /// is casting to a vector with a smaller element size, perform multiple element
2775 /// extracts and merge the results. If this is coercing to a vector with larger
2776 /// elements, index the bitcasted vector and extract the target element with bit
2777 /// operations. This is intended to force the indexing in the native register
2778 /// size for architectures that can dynamically index the register file.
2779 LegalizerHelper::LegalizeResult
2780 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
2781                                          LLT CastTy) {
2782   if (TypeIdx != 1)
2783     return UnableToLegalize;
2784 
2785   auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
2786 
2787   LLT SrcEltTy = SrcVecTy.getElementType();
2788   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2789   unsigned OldNumElts = SrcVecTy.getNumElements();
2790 
2791   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2792   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2793 
2794   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2795   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
2796   if (NewNumElts > OldNumElts) {
2797     // Decreasing the vector element size
2798     //
2799     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
2800     //  =>
2801     //  v4i32:castx = bitcast x:v2i64
2802     //
2803     // i64 = bitcast
2804     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
2805     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
2806     //
2807     if (NewNumElts % OldNumElts != 0)
2808       return UnableToLegalize;
2809 
2810     // Type of the intermediate result vector.
2811     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
2812     LLT MidTy =
2813         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
2814 
2815     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
2816 
2817     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
2818     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
2819 
2820     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
2821       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
2822       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
2823       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
2824       NewOps[I] = Elt.getReg(0);
2825     }
2826 
2827     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
2828     MIRBuilder.buildBitcast(Dst, NewVec);
2829     MI.eraseFromParent();
2830     return Legalized;
2831   }
2832 
2833   if (NewNumElts < OldNumElts) {
2834     if (NewEltSize % OldEltSize != 0)
2835       return UnableToLegalize;
2836 
2837     // This only depends on powers of 2 because we use bit tricks to figure out
2838     // the bit offset we need to shift to get the target element. A general
2839     // expansion could emit division/multiply.
2840     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2841       return UnableToLegalize;
2842 
2843     // Increasing the vector element size.
2844     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
2845     //
2846     //   =>
2847     //
2848     // %cast = G_BITCAST %vec
2849     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
2850     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
2851     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
2852     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
2853     // %elt_bits = G_LSHR %wide_elt, %offset_bits
2854     // %elt = G_TRUNC %elt_bits
2855 
2856     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2857     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2858 
2859     // Divide to get the index in the wider element type.
2860     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2861 
2862     Register WideElt = CastVec;
2863     if (CastTy.isVector()) {
2864       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2865                                                      ScaledIdx).getReg(0);
2866     }
2867 
2868     // Compute the bit offset into the register of the target element.
2869     Register OffsetBits = getBitcastWiderVectorElementOffset(
2870       MIRBuilder, Idx, NewEltSize, OldEltSize);
2871 
2872     // Shift the wide element to get the target element.
2873     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
2874     MIRBuilder.buildTrunc(Dst, ExtractedBits);
2875     MI.eraseFromParent();
2876     return Legalized;
2877   }
2878 
2879   return UnableToLegalize;
2880 }
2881 
2882 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
2883 /// TargetReg, while preserving other bits in \p TargetReg.
2884 ///
2885 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
2886 static Register buildBitFieldInsert(MachineIRBuilder &B,
2887                                     Register TargetReg, Register InsertReg,
2888                                     Register OffsetBits) {
2889   LLT TargetTy = B.getMRI()->getType(TargetReg);
2890   LLT InsertTy = B.getMRI()->getType(InsertReg);
2891   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
2892   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
2893 
2894   // Produce a bitmask of the value to insert
2895   auto EltMask = B.buildConstant(
2896     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
2897                                    InsertTy.getSizeInBits()));
2898   // Shift it into position
2899   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
2900   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
2901 
2902   // Clear out the bits in the wide element
2903   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
2904 
2905   // The value to insert has all zeros already, so stick it into the masked
2906   // wide element.
2907   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
2908 }
2909 
2910 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
2911 /// is increasing the element size, perform the indexing in the target element
2912 /// type, and use bit operations to insert at the element position. This is
2913 /// intended for architectures that can dynamically index the register file and
2914 /// want to force indexing in the native register size.
2915 LegalizerHelper::LegalizeResult
2916 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
2917                                         LLT CastTy) {
2918   if (TypeIdx != 0)
2919     return UnableToLegalize;
2920 
2921   auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
2922       MI.getFirst4RegLLTs();
2923   LLT VecTy = DstTy;
2924 
2925   LLT VecEltTy = VecTy.getElementType();
2926   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
2927   const unsigned NewEltSize = NewEltTy.getSizeInBits();
2928   const unsigned OldEltSize = VecEltTy.getSizeInBits();
2929 
2930   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
2931   unsigned OldNumElts = VecTy.getNumElements();
2932 
2933   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
2934   if (NewNumElts < OldNumElts) {
2935     if (NewEltSize % OldEltSize != 0)
2936       return UnableToLegalize;
2937 
2938     // This only depends on powers of 2 because we use bit tricks to figure out
2939     // the bit offset we need to shift to get the target element. A general
2940     // expansion could emit division/multiply.
2941     if (!isPowerOf2_32(NewEltSize / OldEltSize))
2942       return UnableToLegalize;
2943 
2944     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
2945     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
2946 
2947     // Divide to get the index in the wider element type.
2948     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
2949 
2950     Register ExtractedElt = CastVec;
2951     if (CastTy.isVector()) {
2952       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
2953                                                           ScaledIdx).getReg(0);
2954     }
2955 
2956     // Compute the bit offset into the register of the target element.
2957     Register OffsetBits = getBitcastWiderVectorElementOffset(
2958       MIRBuilder, Idx, NewEltSize, OldEltSize);
2959 
2960     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
2961                                                Val, OffsetBits);
2962     if (CastTy.isVector()) {
2963       InsertedElt = MIRBuilder.buildInsertVectorElement(
2964         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
2965     }
2966 
2967     MIRBuilder.buildBitcast(Dst, InsertedElt);
2968     MI.eraseFromParent();
2969     return Legalized;
2970   }
2971 
2972   return UnableToLegalize;
2973 }
2974 
2975 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
2976   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
2977   Register DstReg = LoadMI.getDstReg();
2978   Register PtrReg = LoadMI.getPointerReg();
2979   LLT DstTy = MRI.getType(DstReg);
2980   MachineMemOperand &MMO = LoadMI.getMMO();
2981   LLT MemTy = MMO.getMemoryType();
2982   MachineFunction &MF = MIRBuilder.getMF();
2983 
2984   unsigned MemSizeInBits = MemTy.getSizeInBits();
2985   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
2986 
2987   if (MemSizeInBits != MemStoreSizeInBits) {
2988     if (MemTy.isVector())
2989       return UnableToLegalize;
2990 
2991     // Promote to a byte-sized load if not loading an integral number of
2992     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
2993     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
2994     MachineMemOperand *NewMMO =
2995         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
2996 
2997     Register LoadReg = DstReg;
2998     LLT LoadTy = DstTy;
2999 
3000     // If this wasn't already an extending load, we need to widen the result
3001     // register to avoid creating a load with a narrower result than the source.
3002     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
3003       LoadTy = WideMemTy;
3004       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
3005     }
3006 
3007     if (isa<GSExtLoad>(LoadMI)) {
3008       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
3009       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
3010     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
3011       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
3012       // The extra bits are guaranteed to be zero, since we stored them that
3013       // way.  A zext load from Wide thus automatically gives zext from MemVT.
3014       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3015     } else {
3016       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3017     }
3018 
3019     if (DstTy != LoadTy)
3020       MIRBuilder.buildTrunc(DstReg, LoadReg);
3021 
3022     LoadMI.eraseFromParent();
3023     return Legalized;
3024   }
3025 
3026   // Big endian lowering not implemented.
3027   if (MIRBuilder.getDataLayout().isBigEndian())
3028     return UnableToLegalize;
3029 
3030   // This load needs splitting into power of 2 sized loads.
3031   //
3032   // Our strategy here is to generate anyextending loads for the smaller
3033   // types up to next power-2 result type, and then combine the two larger
3034   // result values together, before truncating back down to the non-pow-2
3035   // type.
3036   // E.g. v1 = i24 load =>
3037   // v2 = i32 zextload (2 byte)
3038   // v3 = i32 load (1 byte)
3039   // v4 = i32 shl v3, 16
3040   // v5 = i32 or v4, v2
3041   // v1 = i24 trunc v5
3042   // By doing this we generate the correct truncate which should get
3043   // combined away as an artifact with a matching extend.
3044 
3045   uint64_t LargeSplitSize, SmallSplitSize;
3046 
3047   if (!isPowerOf2_32(MemSizeInBits)) {
3048     // This load needs splitting into power of 2 sized loads.
3049     LargeSplitSize = llvm::bit_floor(MemSizeInBits);
3050     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3051   } else {
3052     // This is already a power of 2, but we still need to split this in half.
3053     //
3054     // Assume we're being asked to decompose an unaligned load.
3055     // TODO: If this requires multiple splits, handle them all at once.
3056     auto &Ctx = MF.getFunction().getContext();
3057     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3058       return UnableToLegalize;
3059 
3060     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3061   }
3062 
3063   if (MemTy.isVector()) {
3064     // TODO: Handle vector extloads
3065     if (MemTy != DstTy)
3066       return UnableToLegalize;
3067 
3068     // TODO: We can do better than scalarizing the vector and at least split it
3069     // in half.
3070     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3071   }
3072 
3073   MachineMemOperand *LargeMMO =
3074       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3075   MachineMemOperand *SmallMMO =
3076       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3077 
3078   LLT PtrTy = MRI.getType(PtrReg);
3079   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3080   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3081   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3082                                              PtrReg, *LargeMMO);
3083 
3084   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3085                                             LargeSplitSize / 8);
3086   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3087   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3088   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3089                                              SmallPtr, *SmallMMO);
3090 
3091   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3092   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3093 
3094   if (AnyExtTy == DstTy)
3095     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3096   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3097     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3098     MIRBuilder.buildTrunc(DstReg, {Or});
3099   } else {
3100     assert(DstTy.isPointer() && "expected pointer");
3101     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3102 
3103     // FIXME: We currently consider this to be illegal for non-integral address
3104     // spaces, but we need still need a way to reinterpret the bits.
3105     MIRBuilder.buildIntToPtr(DstReg, Or);
3106   }
3107 
3108   LoadMI.eraseFromParent();
3109   return Legalized;
3110 }
3111 
3112 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3113   // Lower a non-power of 2 store into multiple pow-2 stores.
3114   // E.g. split an i24 store into an i16 store + i8 store.
3115   // We do this by first extending the stored value to the next largest power
3116   // of 2 type, and then using truncating stores to store the components.
3117   // By doing this, likewise with G_LOAD, generate an extend that can be
3118   // artifact-combined away instead of leaving behind extracts.
3119   Register SrcReg = StoreMI.getValueReg();
3120   Register PtrReg = StoreMI.getPointerReg();
3121   LLT SrcTy = MRI.getType(SrcReg);
3122   MachineFunction &MF = MIRBuilder.getMF();
3123   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3124   LLT MemTy = MMO.getMemoryType();
3125 
3126   unsigned StoreWidth = MemTy.getSizeInBits();
3127   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3128 
3129   if (StoreWidth != StoreSizeInBits) {
3130     if (SrcTy.isVector())
3131       return UnableToLegalize;
3132 
3133     // Promote to a byte-sized store with upper bits zero if not
3134     // storing an integral number of bytes.  For example, promote
3135     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3136     LLT WideTy = LLT::scalar(StoreSizeInBits);
3137 
3138     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3139       // Avoid creating a store with a narrower source than result.
3140       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3141       SrcTy = WideTy;
3142     }
3143 
3144     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3145 
3146     MachineMemOperand *NewMMO =
3147         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3148     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3149     StoreMI.eraseFromParent();
3150     return Legalized;
3151   }
3152 
3153   if (MemTy.isVector()) {
3154     // TODO: Handle vector trunc stores
3155     if (MemTy != SrcTy)
3156       return UnableToLegalize;
3157 
3158     // TODO: We can do better than scalarizing the vector and at least split it
3159     // in half.
3160     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3161   }
3162 
3163   unsigned MemSizeInBits = MemTy.getSizeInBits();
3164   uint64_t LargeSplitSize, SmallSplitSize;
3165 
3166   if (!isPowerOf2_32(MemSizeInBits)) {
3167     LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
3168     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3169   } else {
3170     auto &Ctx = MF.getFunction().getContext();
3171     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3172       return UnableToLegalize; // Don't know what we're being asked to do.
3173 
3174     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3175   }
3176 
3177   // Extend to the next pow-2. If this store was itself the result of lowering,
3178   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3179   // that's wider than the stored size.
3180   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3181   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3182 
3183   if (SrcTy.isPointer()) {
3184     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3185     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3186   }
3187 
3188   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3189 
3190   // Obtain the smaller value by shifting away the larger value.
3191   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3192   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3193 
3194   // Generate the PtrAdd and truncating stores.
3195   LLT PtrTy = MRI.getType(PtrReg);
3196   auto OffsetCst = MIRBuilder.buildConstant(
3197     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3198   auto SmallPtr =
3199     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3200 
3201   MachineMemOperand *LargeMMO =
3202     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3203   MachineMemOperand *SmallMMO =
3204     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3205   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3206   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3207   StoreMI.eraseFromParent();
3208   return Legalized;
3209 }
3210 
3211 LegalizerHelper::LegalizeResult
3212 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3213   switch (MI.getOpcode()) {
3214   case TargetOpcode::G_LOAD: {
3215     if (TypeIdx != 0)
3216       return UnableToLegalize;
3217     MachineMemOperand &MMO = **MI.memoperands_begin();
3218 
3219     // Not sure how to interpret a bitcast of an extending load.
3220     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3221       return UnableToLegalize;
3222 
3223     Observer.changingInstr(MI);
3224     bitcastDst(MI, CastTy, 0);
3225     MMO.setType(CastTy);
3226     Observer.changedInstr(MI);
3227     return Legalized;
3228   }
3229   case TargetOpcode::G_STORE: {
3230     if (TypeIdx != 0)
3231       return UnableToLegalize;
3232 
3233     MachineMemOperand &MMO = **MI.memoperands_begin();
3234 
3235     // Not sure how to interpret a bitcast of a truncating store.
3236     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3237       return UnableToLegalize;
3238 
3239     Observer.changingInstr(MI);
3240     bitcastSrc(MI, CastTy, 0);
3241     MMO.setType(CastTy);
3242     Observer.changedInstr(MI);
3243     return Legalized;
3244   }
3245   case TargetOpcode::G_SELECT: {
3246     if (TypeIdx != 0)
3247       return UnableToLegalize;
3248 
3249     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3250       LLVM_DEBUG(
3251           dbgs() << "bitcast action not implemented for vector select\n");
3252       return UnableToLegalize;
3253     }
3254 
3255     Observer.changingInstr(MI);
3256     bitcastSrc(MI, CastTy, 2);
3257     bitcastSrc(MI, CastTy, 3);
3258     bitcastDst(MI, CastTy, 0);
3259     Observer.changedInstr(MI);
3260     return Legalized;
3261   }
3262   case TargetOpcode::G_AND:
3263   case TargetOpcode::G_OR:
3264   case TargetOpcode::G_XOR: {
3265     Observer.changingInstr(MI);
3266     bitcastSrc(MI, CastTy, 1);
3267     bitcastSrc(MI, CastTy, 2);
3268     bitcastDst(MI, CastTy, 0);
3269     Observer.changedInstr(MI);
3270     return Legalized;
3271   }
3272   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3273     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3274   case TargetOpcode::G_INSERT_VECTOR_ELT:
3275     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3276   default:
3277     return UnableToLegalize;
3278   }
3279 }
3280 
3281 // Legalize an instruction by changing the opcode in place.
3282 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3283     Observer.changingInstr(MI);
3284     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3285     Observer.changedInstr(MI);
3286 }
3287 
3288 LegalizerHelper::LegalizeResult
3289 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3290   using namespace TargetOpcode;
3291 
3292   switch(MI.getOpcode()) {
3293   default:
3294     return UnableToLegalize;
3295   case TargetOpcode::G_FCONSTANT:
3296     return lowerFConstant(MI);
3297   case TargetOpcode::G_BITCAST:
3298     return lowerBitcast(MI);
3299   case TargetOpcode::G_SREM:
3300   case TargetOpcode::G_UREM: {
3301     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3302     auto Quot =
3303         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3304                               {MI.getOperand(1), MI.getOperand(2)});
3305 
3306     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3307     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3308     MI.eraseFromParent();
3309     return Legalized;
3310   }
3311   case TargetOpcode::G_SADDO:
3312   case TargetOpcode::G_SSUBO:
3313     return lowerSADDO_SSUBO(MI);
3314   case TargetOpcode::G_UMULH:
3315   case TargetOpcode::G_SMULH:
3316     return lowerSMULH_UMULH(MI);
3317   case TargetOpcode::G_SMULO:
3318   case TargetOpcode::G_UMULO: {
3319     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3320     // result.
3321     auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
3322     LLT Ty = MRI.getType(Res);
3323 
3324     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3325                           ? TargetOpcode::G_SMULH
3326                           : TargetOpcode::G_UMULH;
3327 
3328     Observer.changingInstr(MI);
3329     const auto &TII = MIRBuilder.getTII();
3330     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3331     MI.removeOperand(1);
3332     Observer.changedInstr(MI);
3333 
3334     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3335     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3336 
3337     // Move insert point forward so we can use the Res register if needed.
3338     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3339 
3340     // For *signed* multiply, overflow is detected by checking:
3341     // (hi != (lo >> bitwidth-1))
3342     if (Opcode == TargetOpcode::G_SMULH) {
3343       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3344       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3345       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3346     } else {
3347       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3348     }
3349     return Legalized;
3350   }
3351   case TargetOpcode::G_FNEG: {
3352     auto [Res, SubByReg] = MI.getFirst2Regs();
3353     LLT Ty = MRI.getType(Res);
3354 
3355     // TODO: Handle vector types once we are able to
3356     // represent them.
3357     if (Ty.isVector())
3358       return UnableToLegalize;
3359     auto SignMask =
3360         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3361     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3362     MI.eraseFromParent();
3363     return Legalized;
3364   }
3365   case TargetOpcode::G_FSUB:
3366   case TargetOpcode::G_STRICT_FSUB: {
3367     auto [Res, LHS, RHS] = MI.getFirst3Regs();
3368     LLT Ty = MRI.getType(Res);
3369 
3370     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3371     auto Neg = MIRBuilder.buildFNeg(Ty, RHS);
3372 
3373     if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
3374       MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
3375     else
3376       MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3377 
3378     MI.eraseFromParent();
3379     return Legalized;
3380   }
3381   case TargetOpcode::G_FMAD:
3382     return lowerFMad(MI);
3383   case TargetOpcode::G_FFLOOR:
3384     return lowerFFloor(MI);
3385   case TargetOpcode::G_INTRINSIC_ROUND:
3386     return lowerIntrinsicRound(MI);
3387   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
3388     // Since round even is the assumed rounding mode for unconstrained FP
3389     // operations, rint and roundeven are the same operation.
3390     changeOpcode(MI, TargetOpcode::G_FRINT);
3391     return Legalized;
3392   }
3393   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3394     auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
3395     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3396                                   **MI.memoperands_begin());
3397     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3398     MI.eraseFromParent();
3399     return Legalized;
3400   }
3401   case TargetOpcode::G_LOAD:
3402   case TargetOpcode::G_SEXTLOAD:
3403   case TargetOpcode::G_ZEXTLOAD:
3404     return lowerLoad(cast<GAnyLoad>(MI));
3405   case TargetOpcode::G_STORE:
3406     return lowerStore(cast<GStore>(MI));
3407   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3408   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3409   case TargetOpcode::G_CTLZ:
3410   case TargetOpcode::G_CTTZ:
3411   case TargetOpcode::G_CTPOP:
3412     return lowerBitCount(MI);
3413   case G_UADDO: {
3414     auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
3415 
3416     MIRBuilder.buildAdd(Res, LHS, RHS);
3417     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3418 
3419     MI.eraseFromParent();
3420     return Legalized;
3421   }
3422   case G_UADDE: {
3423     auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
3424     LLT Ty = MRI.getType(Res);
3425 
3426     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3427     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3428     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3429     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
3430 
3431     MI.eraseFromParent();
3432     return Legalized;
3433   }
3434   case G_USUBO: {
3435     auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
3436 
3437     MIRBuilder.buildSub(Res, LHS, RHS);
3438     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3439 
3440     MI.eraseFromParent();
3441     return Legalized;
3442   }
3443   case G_USUBE: {
3444     auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
3445     const LLT CondTy = MRI.getType(BorrowOut);
3446     const LLT Ty = MRI.getType(Res);
3447 
3448     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3449     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3450     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3451 
3452     auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
3453     auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
3454     MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
3455 
3456     MI.eraseFromParent();
3457     return Legalized;
3458   }
3459   case G_UITOFP:
3460     return lowerUITOFP(MI);
3461   case G_SITOFP:
3462     return lowerSITOFP(MI);
3463   case G_FPTOUI:
3464     return lowerFPTOUI(MI);
3465   case G_FPTOSI:
3466     return lowerFPTOSI(MI);
3467   case G_FPTRUNC:
3468     return lowerFPTRUNC(MI);
3469   case G_FPOWI:
3470     return lowerFPOWI(MI);
3471   case G_SMIN:
3472   case G_SMAX:
3473   case G_UMIN:
3474   case G_UMAX:
3475     return lowerMinMax(MI);
3476   case G_FCOPYSIGN:
3477     return lowerFCopySign(MI);
3478   case G_FMINNUM:
3479   case G_FMAXNUM:
3480     return lowerFMinNumMaxNum(MI);
3481   case G_MERGE_VALUES:
3482     return lowerMergeValues(MI);
3483   case G_UNMERGE_VALUES:
3484     return lowerUnmergeValues(MI);
3485   case TargetOpcode::G_SEXT_INREG: {
3486     assert(MI.getOperand(2).isImm() && "Expected immediate");
3487     int64_t SizeInBits = MI.getOperand(2).getImm();
3488 
3489     auto [DstReg, SrcReg] = MI.getFirst2Regs();
3490     LLT DstTy = MRI.getType(DstReg);
3491     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3492 
3493     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3494     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3495     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3496     MI.eraseFromParent();
3497     return Legalized;
3498   }
3499   case G_EXTRACT_VECTOR_ELT:
3500   case G_INSERT_VECTOR_ELT:
3501     return lowerExtractInsertVectorElt(MI);
3502   case G_SHUFFLE_VECTOR:
3503     return lowerShuffleVector(MI);
3504   case G_DYN_STACKALLOC:
3505     return lowerDynStackAlloc(MI);
3506   case G_EXTRACT:
3507     return lowerExtract(MI);
3508   case G_INSERT:
3509     return lowerInsert(MI);
3510   case G_BSWAP:
3511     return lowerBswap(MI);
3512   case G_BITREVERSE:
3513     return lowerBitreverse(MI);
3514   case G_READ_REGISTER:
3515   case G_WRITE_REGISTER:
3516     return lowerReadWriteRegister(MI);
3517   case G_UADDSAT:
3518   case G_USUBSAT: {
3519     // Try to make a reasonable guess about which lowering strategy to use. The
3520     // target can override this with custom lowering and calling the
3521     // implementation functions.
3522     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3523     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3524       return lowerAddSubSatToMinMax(MI);
3525     return lowerAddSubSatToAddoSubo(MI);
3526   }
3527   case G_SADDSAT:
3528   case G_SSUBSAT: {
3529     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3530 
3531     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3532     // since it's a shorter expansion. However, we would need to figure out the
3533     // preferred boolean type for the carry out for the query.
3534     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3535       return lowerAddSubSatToMinMax(MI);
3536     return lowerAddSubSatToAddoSubo(MI);
3537   }
3538   case G_SSHLSAT:
3539   case G_USHLSAT:
3540     return lowerShlSat(MI);
3541   case G_ABS:
3542     return lowerAbsToAddXor(MI);
3543   case G_SELECT:
3544     return lowerSelect(MI);
3545   case G_IS_FPCLASS:
3546     return lowerISFPCLASS(MI);
3547   case G_SDIVREM:
3548   case G_UDIVREM:
3549     return lowerDIVREM(MI);
3550   case G_FSHL:
3551   case G_FSHR:
3552     return lowerFunnelShift(MI);
3553   case G_ROTL:
3554   case G_ROTR:
3555     return lowerRotate(MI);
3556   case G_MEMSET:
3557   case G_MEMCPY:
3558   case G_MEMMOVE:
3559     return lowerMemCpyFamily(MI);
3560   case G_MEMCPY_INLINE:
3561     return lowerMemcpyInline(MI);
3562   GISEL_VECREDUCE_CASES_NONSEQ
3563     return lowerVectorReduction(MI);
3564   }
3565 }
3566 
3567 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3568                                                   Align MinAlign) const {
3569   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3570   // datalayout for the preferred alignment. Also there should be a target hook
3571   // for this to allow targets to reduce the alignment and ignore the
3572   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3573   // the type.
3574   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3575 }
3576 
3577 MachineInstrBuilder
3578 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3579                                       MachinePointerInfo &PtrInfo) {
3580   MachineFunction &MF = MIRBuilder.getMF();
3581   const DataLayout &DL = MIRBuilder.getDataLayout();
3582   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3583 
3584   unsigned AddrSpace = DL.getAllocaAddrSpace();
3585   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3586 
3587   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3588   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3589 }
3590 
3591 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3592                                         LLT VecTy) {
3593   int64_t IdxVal;
3594   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3595     return IdxReg;
3596 
3597   LLT IdxTy = B.getMRI()->getType(IdxReg);
3598   unsigned NElts = VecTy.getNumElements();
3599   if (isPowerOf2_32(NElts)) {
3600     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3601     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3602   }
3603 
3604   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3605       .getReg(0);
3606 }
3607 
3608 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3609                                                   Register Index) {
3610   LLT EltTy = VecTy.getElementType();
3611 
3612   // Calculate the element offset and add it to the pointer.
3613   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3614   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3615          "Converting bits to bytes lost precision");
3616 
3617   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3618 
3619   LLT IdxTy = MRI.getType(Index);
3620   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3621                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3622 
3623   LLT PtrTy = MRI.getType(VecPtr);
3624   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3625 }
3626 
3627 #ifndef NDEBUG
3628 /// Check that all vector operands have same number of elements. Other operands
3629 /// should be listed in NonVecOp.
3630 static bool hasSameNumEltsOnAllVectorOperands(
3631     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3632     std::initializer_list<unsigned> NonVecOpIndices) {
3633   if (MI.getNumMemOperands() != 0)
3634     return false;
3635 
3636   LLT VecTy = MRI.getType(MI.getReg(0));
3637   if (!VecTy.isVector())
3638     return false;
3639   unsigned NumElts = VecTy.getNumElements();
3640 
3641   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3642     MachineOperand &Op = MI.getOperand(OpIdx);
3643     if (!Op.isReg()) {
3644       if (!is_contained(NonVecOpIndices, OpIdx))
3645         return false;
3646       continue;
3647     }
3648 
3649     LLT Ty = MRI.getType(Op.getReg());
3650     if (!Ty.isVector()) {
3651       if (!is_contained(NonVecOpIndices, OpIdx))
3652         return false;
3653       continue;
3654     }
3655 
3656     if (Ty.getNumElements() != NumElts)
3657       return false;
3658   }
3659 
3660   return true;
3661 }
3662 #endif
3663 
3664 /// Fill \p DstOps with DstOps that have same number of elements combined as
3665 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
3666 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
3667 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
3668 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
3669                        unsigned NumElts) {
3670   LLT LeftoverTy;
3671   assert(Ty.isVector() && "Expected vector type");
3672   LLT EltTy = Ty.getElementType();
3673   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
3674   int NumParts, NumLeftover;
3675   std::tie(NumParts, NumLeftover) =
3676       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
3677 
3678   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
3679   for (int i = 0; i < NumParts; ++i) {
3680     DstOps.push_back(NarrowTy);
3681   }
3682 
3683   if (LeftoverTy.isValid()) {
3684     assert(NumLeftover == 1 && "expected exactly one leftover");
3685     DstOps.push_back(LeftoverTy);
3686   }
3687 }
3688 
3689 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
3690 /// made from \p Op depending on operand type.
3691 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
3692                            MachineOperand &Op) {
3693   for (unsigned i = 0; i < N; ++i) {
3694     if (Op.isReg())
3695       Ops.push_back(Op.getReg());
3696     else if (Op.isImm())
3697       Ops.push_back(Op.getImm());
3698     else if (Op.isPredicate())
3699       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
3700     else
3701       llvm_unreachable("Unsupported type");
3702   }
3703 }
3704 
3705 // Handle splitting vector operations which need to have the same number of
3706 // elements in each type index, but each type index may have a different element
3707 // type.
3708 //
3709 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
3710 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3711 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3712 //
3713 // Also handles some irregular breakdown cases, e.g.
3714 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
3715 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
3716 //             s64 = G_SHL s64, s32
3717 LegalizerHelper::LegalizeResult
3718 LegalizerHelper::fewerElementsVectorMultiEltType(
3719     GenericMachineInstr &MI, unsigned NumElts,
3720     std::initializer_list<unsigned> NonVecOpIndices) {
3721   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
3722          "Non-compatible opcode or not specified non-vector operands");
3723   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3724 
3725   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3726   unsigned NumDefs = MI.getNumDefs();
3727 
3728   // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
3729   // Build instructions with DstOps to use instruction found by CSE directly.
3730   // CSE copies found instruction into given vreg when building with vreg dest.
3731   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
3732   // Output registers will be taken from created instructions.
3733   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
3734   for (unsigned i = 0; i < NumDefs; ++i) {
3735     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
3736   }
3737 
3738   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
3739   // Operands listed in NonVecOpIndices will be used as is without splitting;
3740   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
3741   // scalar condition (op 1), immediate in sext_inreg (op 2).
3742   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
3743   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3744        ++UseIdx, ++UseNo) {
3745     if (is_contained(NonVecOpIndices, UseIdx)) {
3746       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
3747                      MI.getOperand(UseIdx));
3748     } else {
3749       SmallVector<Register, 8> SplitPieces;
3750       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces);
3751       for (auto Reg : SplitPieces)
3752         InputOpsPieces[UseNo].push_back(Reg);
3753     }
3754   }
3755 
3756   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3757 
3758   // Take i-th piece of each input operand split and build sub-vector/scalar
3759   // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
3760   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3761     SmallVector<DstOp, 2> Defs;
3762     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3763       Defs.push_back(OutputOpsPieces[DstNo][i]);
3764 
3765     SmallVector<SrcOp, 3> Uses;
3766     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
3767       Uses.push_back(InputOpsPieces[InputNo][i]);
3768 
3769     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
3770     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
3771       OutputRegs[DstNo].push_back(I.getReg(DstNo));
3772   }
3773 
3774   // Merge small outputs into MI's output for each def operand.
3775   if (NumLeftovers) {
3776     for (unsigned i = 0; i < NumDefs; ++i)
3777       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
3778   } else {
3779     for (unsigned i = 0; i < NumDefs; ++i)
3780       MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
3781   }
3782 
3783   MI.eraseFromParent();
3784   return Legalized;
3785 }
3786 
3787 LegalizerHelper::LegalizeResult
3788 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
3789                                         unsigned NumElts) {
3790   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
3791 
3792   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
3793   unsigned NumDefs = MI.getNumDefs();
3794 
3795   SmallVector<DstOp, 8> OutputOpsPieces;
3796   SmallVector<Register, 8> OutputRegs;
3797   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
3798 
3799   // Instructions that perform register split will be inserted in basic block
3800   // where register is defined (basic block is in the next operand).
3801   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
3802   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
3803        UseIdx += 2, ++UseNo) {
3804     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
3805     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
3806     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]);
3807   }
3808 
3809   // Build PHIs with fewer elements.
3810   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
3811   MIRBuilder.setInsertPt(*MI.getParent(), MI);
3812   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
3813     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
3814     Phi.addDef(
3815         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
3816     OutputRegs.push_back(Phi.getReg(0));
3817 
3818     for (unsigned j = 0; j < NumInputs / 2; ++j) {
3819       Phi.addUse(InputOpsPieces[j][i]);
3820       Phi.add(MI.getOperand(1 + j * 2 + 1));
3821     }
3822   }
3823 
3824   // Merge small outputs into MI's def.
3825   if (NumLeftovers) {
3826     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
3827   } else {
3828     MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
3829   }
3830 
3831   MI.eraseFromParent();
3832   return Legalized;
3833 }
3834 
3835 LegalizerHelper::LegalizeResult
3836 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
3837                                                   unsigned TypeIdx,
3838                                                   LLT NarrowTy) {
3839   const int NumDst = MI.getNumOperands() - 1;
3840   const Register SrcReg = MI.getOperand(NumDst).getReg();
3841   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3842   LLT SrcTy = MRI.getType(SrcReg);
3843 
3844   if (TypeIdx != 1 || NarrowTy == DstTy)
3845     return UnableToLegalize;
3846 
3847   // Requires compatible types. Otherwise SrcReg should have been defined by
3848   // merge-like instruction that would get artifact combined. Most likely
3849   // instruction that defines SrcReg has to perform more/fewer elements
3850   // legalization compatible with NarrowTy.
3851   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3852   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3853 
3854   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3855       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
3856     return UnableToLegalize;
3857 
3858   // This is most likely DstTy (smaller then register size) packed in SrcTy
3859   // (larger then register size) and since unmerge was not combined it will be
3860   // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
3861   // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.
3862 
3863   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
3864   //
3865   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
3866   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
3867   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
3868   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
3869   const int NumUnmerge = Unmerge->getNumOperands() - 1;
3870   const int PartsPerUnmerge = NumDst / NumUnmerge;
3871 
3872   for (int I = 0; I != NumUnmerge; ++I) {
3873     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
3874 
3875     for (int J = 0; J != PartsPerUnmerge; ++J)
3876       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
3877     MIB.addUse(Unmerge.getReg(I));
3878   }
3879 
3880   MI.eraseFromParent();
3881   return Legalized;
3882 }
3883 
3884 LegalizerHelper::LegalizeResult
3885 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
3886                                           LLT NarrowTy) {
3887   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3888   // Requires compatible types. Otherwise user of DstReg did not perform unmerge
3889   // that should have been artifact combined. Most likely instruction that uses
3890   // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
3891   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
3892   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3893   if (NarrowTy == SrcTy)
3894     return UnableToLegalize;
3895 
3896   // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
3897   // is for old mir tests. Since the changes to more/fewer elements it should no
3898   // longer be possible to generate MIR like this when starting from llvm-ir
3899   // because LCMTy approach was replaced with merge/unmerge to vector elements.
3900   if (TypeIdx == 1) {
3901     assert(SrcTy.isVector() && "Expected vector types");
3902     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
3903     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
3904         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
3905       return UnableToLegalize;
3906     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
3907     //
3908     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
3909     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
3910     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
3911     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
3912     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
3913     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
3914 
3915     SmallVector<Register, 8> Elts;
3916     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
3917     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
3918       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
3919       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
3920         Elts.push_back(Unmerge.getReg(j));
3921     }
3922 
3923     SmallVector<Register, 8> NarrowTyElts;
3924     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
3925     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
3926     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
3927          ++i, Offset += NumNarrowTyElts) {
3928       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
3929       NarrowTyElts.push_back(
3930           MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
3931     }
3932 
3933     MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
3934     MI.eraseFromParent();
3935     return Legalized;
3936   }
3937 
3938   assert(TypeIdx == 0 && "Bad type index");
3939   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
3940       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
3941     return UnableToLegalize;
3942 
3943   // This is most likely SrcTy (smaller then register size) packed in DstTy
3944   // (larger then register size) and since merge was not combined it will be
3945   // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
3946   // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.
3947 
3948   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
3949   //
3950   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
3951   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
3952   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
3953   SmallVector<Register, 8> NarrowTyElts;
3954   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
3955   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
3956   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
3957   for (unsigned i = 0; i < NumParts; ++i) {
3958     SmallVector<Register, 8> Sources;
3959     for (unsigned j = 0; j < NumElts; ++j)
3960       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
3961     NarrowTyElts.push_back(
3962         MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
3963   }
3964 
3965   MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
3966   MI.eraseFromParent();
3967   return Legalized;
3968 }
3969 
3970 LegalizerHelper::LegalizeResult
3971 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
3972                                                            unsigned TypeIdx,
3973                                                            LLT NarrowVecTy) {
3974   auto [DstReg, SrcVec] = MI.getFirst2Regs();
3975   Register InsertVal;
3976   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
3977 
3978   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
3979   if (IsInsert)
3980     InsertVal = MI.getOperand(2).getReg();
3981 
3982   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
3983 
3984   // TODO: Handle total scalarization case.
3985   if (!NarrowVecTy.isVector())
3986     return UnableToLegalize;
3987 
3988   LLT VecTy = MRI.getType(SrcVec);
3989 
3990   // If the index is a constant, we can really break this down as you would
3991   // expect, and index into the target size pieces.
3992   int64_t IdxVal;
3993   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
3994   if (MaybeCst) {
3995     IdxVal = MaybeCst->Value.getSExtValue();
3996     // Avoid out of bounds indexing the pieces.
3997     if (IdxVal >= VecTy.getNumElements()) {
3998       MIRBuilder.buildUndef(DstReg);
3999       MI.eraseFromParent();
4000       return Legalized;
4001     }
4002 
4003     SmallVector<Register, 8> VecParts;
4004     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4005 
4006     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4007     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4008                                     TargetOpcode::G_ANYEXT);
4009 
4010     unsigned NewNumElts = NarrowVecTy.getNumElements();
4011 
4012     LLT IdxTy = MRI.getType(Idx);
4013     int64_t PartIdx = IdxVal / NewNumElts;
4014     auto NewIdx =
4015         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4016 
4017     if (IsInsert) {
4018       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4019 
4020       // Use the adjusted index to insert into one of the subvectors.
4021       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4022           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4023       VecParts[PartIdx] = InsertPart.getReg(0);
4024 
4025       // Recombine the inserted subvector with the others to reform the result
4026       // vector.
4027       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4028     } else {
4029       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4030     }
4031 
4032     MI.eraseFromParent();
4033     return Legalized;
4034   }
4035 
4036   // With a variable index, we can't perform the operation in a smaller type, so
4037   // we're forced to expand this.
4038   //
4039   // TODO: We could emit a chain of compare/select to figure out which piece to
4040   // index.
4041   return lowerExtractInsertVectorElt(MI);
4042 }
4043 
4044 LegalizerHelper::LegalizeResult
4045 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4046                                       LLT NarrowTy) {
4047   // FIXME: Don't know how to handle secondary types yet.
4048   if (TypeIdx != 0)
4049     return UnableToLegalize;
4050 
4051   // This implementation doesn't work for atomics. Give up instead of doing
4052   // something invalid.
4053   if (LdStMI.isAtomic())
4054     return UnableToLegalize;
4055 
4056   bool IsLoad = isa<GLoad>(LdStMI);
4057   Register ValReg = LdStMI.getReg(0);
4058   Register AddrReg = LdStMI.getPointerReg();
4059   LLT ValTy = MRI.getType(ValReg);
4060 
4061   // FIXME: Do we need a distinct NarrowMemory legalize action?
4062   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4063     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4064     return UnableToLegalize;
4065   }
4066 
4067   int NumParts = -1;
4068   int NumLeftover = -1;
4069   LLT LeftoverTy;
4070   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4071   if (IsLoad) {
4072     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4073   } else {
4074     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4075                      NarrowLeftoverRegs)) {
4076       NumParts = NarrowRegs.size();
4077       NumLeftover = NarrowLeftoverRegs.size();
4078     }
4079   }
4080 
4081   if (NumParts == -1)
4082     return UnableToLegalize;
4083 
4084   LLT PtrTy = MRI.getType(AddrReg);
4085   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4086 
4087   unsigned TotalSize = ValTy.getSizeInBits();
4088 
4089   // Split the load/store into PartTy sized pieces starting at Offset. If this
4090   // is a load, return the new registers in ValRegs. For a store, each elements
4091   // of ValRegs should be PartTy. Returns the next offset that needs to be
4092   // handled.
4093   bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4094   auto MMO = LdStMI.getMMO();
4095   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4096                              unsigned NumParts, unsigned Offset) -> unsigned {
4097     MachineFunction &MF = MIRBuilder.getMF();
4098     unsigned PartSize = PartTy.getSizeInBits();
4099     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4100          ++Idx) {
4101       unsigned ByteOffset = Offset / 8;
4102       Register NewAddrReg;
4103 
4104       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4105 
4106       MachineMemOperand *NewMMO =
4107           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4108 
4109       if (IsLoad) {
4110         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4111         ValRegs.push_back(Dst);
4112         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4113       } else {
4114         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4115       }
4116       Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4117     }
4118 
4119     return Offset;
4120   };
4121 
4122   unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4123   unsigned HandledOffset =
4124       splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4125 
4126   // Handle the rest of the register if this isn't an even type breakdown.
4127   if (LeftoverTy.isValid())
4128     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4129 
4130   if (IsLoad) {
4131     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4132                 LeftoverTy, NarrowLeftoverRegs);
4133   }
4134 
4135   LdStMI.eraseFromParent();
4136   return Legalized;
4137 }
4138 
4139 LegalizerHelper::LegalizeResult
4140 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4141                                      LLT NarrowTy) {
4142   using namespace TargetOpcode;
4143   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4144   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4145 
4146   switch (MI.getOpcode()) {
4147   case G_IMPLICIT_DEF:
4148   case G_TRUNC:
4149   case G_AND:
4150   case G_OR:
4151   case G_XOR:
4152   case G_ADD:
4153   case G_SUB:
4154   case G_MUL:
4155   case G_PTR_ADD:
4156   case G_SMULH:
4157   case G_UMULH:
4158   case G_FADD:
4159   case G_FMUL:
4160   case G_FSUB:
4161   case G_FNEG:
4162   case G_FABS:
4163   case G_FCANONICALIZE:
4164   case G_FDIV:
4165   case G_FREM:
4166   case G_FMA:
4167   case G_FMAD:
4168   case G_FPOW:
4169   case G_FEXP:
4170   case G_FEXP2:
4171   case G_FLOG:
4172   case G_FLOG2:
4173   case G_FLOG10:
4174   case G_FLDEXP:
4175   case G_FNEARBYINT:
4176   case G_FCEIL:
4177   case G_FFLOOR:
4178   case G_FRINT:
4179   case G_INTRINSIC_ROUND:
4180   case G_INTRINSIC_ROUNDEVEN:
4181   case G_INTRINSIC_TRUNC:
4182   case G_FCOS:
4183   case G_FSIN:
4184   case G_FSQRT:
4185   case G_BSWAP:
4186   case G_BITREVERSE:
4187   case G_SDIV:
4188   case G_UDIV:
4189   case G_SREM:
4190   case G_UREM:
4191   case G_SDIVREM:
4192   case G_UDIVREM:
4193   case G_SMIN:
4194   case G_SMAX:
4195   case G_UMIN:
4196   case G_UMAX:
4197   case G_ABS:
4198   case G_FMINNUM:
4199   case G_FMAXNUM:
4200   case G_FMINNUM_IEEE:
4201   case G_FMAXNUM_IEEE:
4202   case G_FMINIMUM:
4203   case G_FMAXIMUM:
4204   case G_FSHL:
4205   case G_FSHR:
4206   case G_ROTL:
4207   case G_ROTR:
4208   case G_FREEZE:
4209   case G_SADDSAT:
4210   case G_SSUBSAT:
4211   case G_UADDSAT:
4212   case G_USUBSAT:
4213   case G_UMULO:
4214   case G_SMULO:
4215   case G_SHL:
4216   case G_LSHR:
4217   case G_ASHR:
4218   case G_SSHLSAT:
4219   case G_USHLSAT:
4220   case G_CTLZ:
4221   case G_CTLZ_ZERO_UNDEF:
4222   case G_CTTZ:
4223   case G_CTTZ_ZERO_UNDEF:
4224   case G_CTPOP:
4225   case G_FCOPYSIGN:
4226   case G_ZEXT:
4227   case G_SEXT:
4228   case G_ANYEXT:
4229   case G_FPEXT:
4230   case G_FPTRUNC:
4231   case G_SITOFP:
4232   case G_UITOFP:
4233   case G_FPTOSI:
4234   case G_FPTOUI:
4235   case G_INTTOPTR:
4236   case G_PTRTOINT:
4237   case G_ADDRSPACE_CAST:
4238   case G_UADDO:
4239   case G_USUBO:
4240   case G_UADDE:
4241   case G_USUBE:
4242   case G_SADDO:
4243   case G_SSUBO:
4244   case G_SADDE:
4245   case G_SSUBE:
4246   case G_STRICT_FADD:
4247   case G_STRICT_FSUB:
4248   case G_STRICT_FMUL:
4249   case G_STRICT_FMA:
4250   case G_STRICT_FLDEXP:
4251   case G_FFREXP:
4252     return fewerElementsVectorMultiEltType(GMI, NumElts);
4253   case G_ICMP:
4254   case G_FCMP:
4255     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/});
4256   case G_IS_FPCLASS:
4257     return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
4258   case G_SELECT:
4259     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4260       return fewerElementsVectorMultiEltType(GMI, NumElts);
4261     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4262   case G_PHI:
4263     return fewerElementsVectorPhi(GMI, NumElts);
4264   case G_UNMERGE_VALUES:
4265     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4266   case G_BUILD_VECTOR:
4267     assert(TypeIdx == 0 && "not a vector type index");
4268     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4269   case G_CONCAT_VECTORS:
4270     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4271       return UnableToLegalize;
4272     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4273   case G_EXTRACT_VECTOR_ELT:
4274   case G_INSERT_VECTOR_ELT:
4275     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4276   case G_LOAD:
4277   case G_STORE:
4278     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4279   case G_SEXT_INREG:
4280     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4281   GISEL_VECREDUCE_CASES_NONSEQ
4282     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4283   case G_SHUFFLE_VECTOR:
4284     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4285   default:
4286     return UnableToLegalize;
4287   }
4288 }
4289 
4290 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4291     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4292   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4293   if (TypeIdx != 0)
4294     return UnableToLegalize;
4295 
4296   auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
4297       MI.getFirst3RegLLTs();
4298   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4299   // The shuffle should be canonicalized by now.
4300   if (DstTy != Src1Ty)
4301     return UnableToLegalize;
4302   if (DstTy != Src2Ty)
4303     return UnableToLegalize;
4304 
4305   if (!isPowerOf2_32(DstTy.getNumElements()))
4306     return UnableToLegalize;
4307 
4308   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4309   // Further legalization attempts will be needed to do split further.
4310   NarrowTy =
4311       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4312   unsigned NewElts = NarrowTy.getNumElements();
4313 
4314   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4315   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs);
4316   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs);
4317   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4318                         SplitSrc2Regs[1]};
4319 
4320   Register Hi, Lo;
4321 
4322   // If Lo or Hi uses elements from at most two of the four input vectors, then
4323   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4324   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4325   SmallVector<int, 16> Ops;
4326   for (unsigned High = 0; High < 2; ++High) {
4327     Register &Output = High ? Hi : Lo;
4328 
4329     // Build a shuffle mask for the output, discovering on the fly which
4330     // input vectors to use as shuffle operands (recorded in InputUsed).
4331     // If building a suitable shuffle vector proves too hard, then bail
4332     // out with useBuildVector set.
4333     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4334     unsigned FirstMaskIdx = High * NewElts;
4335     bool UseBuildVector = false;
4336     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4337       // The mask element.  This indexes into the input.
4338       int Idx = Mask[FirstMaskIdx + MaskOffset];
4339 
4340       // The input vector this mask element indexes into.
4341       unsigned Input = (unsigned)Idx / NewElts;
4342 
4343       if (Input >= std::size(Inputs)) {
4344         // The mask element does not index into any input vector.
4345         Ops.push_back(-1);
4346         continue;
4347       }
4348 
4349       // Turn the index into an offset from the start of the input vector.
4350       Idx -= Input * NewElts;
4351 
4352       // Find or create a shuffle vector operand to hold this input.
4353       unsigned OpNo;
4354       for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
4355         if (InputUsed[OpNo] == Input) {
4356           // This input vector is already an operand.
4357           break;
4358         } else if (InputUsed[OpNo] == -1U) {
4359           // Create a new operand for this input vector.
4360           InputUsed[OpNo] = Input;
4361           break;
4362         }
4363       }
4364 
4365       if (OpNo >= std::size(InputUsed)) {
4366         // More than two input vectors used!  Give up on trying to create a
4367         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4368         UseBuildVector = true;
4369         break;
4370       }
4371 
4372       // Add the mask index for the new shuffle vector.
4373       Ops.push_back(Idx + OpNo * NewElts);
4374     }
4375 
4376     if (UseBuildVector) {
4377       LLT EltTy = NarrowTy.getElementType();
4378       SmallVector<Register, 16> SVOps;
4379 
4380       // Extract the input elements by hand.
4381       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4382         // The mask element.  This indexes into the input.
4383         int Idx = Mask[FirstMaskIdx + MaskOffset];
4384 
4385         // The input vector this mask element indexes into.
4386         unsigned Input = (unsigned)Idx / NewElts;
4387 
4388         if (Input >= std::size(Inputs)) {
4389           // The mask element is "undef" or indexes off the end of the input.
4390           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4391           continue;
4392         }
4393 
4394         // Turn the index into an offset from the start of the input vector.
4395         Idx -= Input * NewElts;
4396 
4397         // Extract the vector element by hand.
4398         SVOps.push_back(MIRBuilder
4399                             .buildExtractVectorElement(
4400                                 EltTy, Inputs[Input],
4401                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4402                             .getReg(0));
4403       }
4404 
4405       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4406       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4407     } else if (InputUsed[0] == -1U) {
4408       // No input vectors were used! The result is undefined.
4409       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4410     } else {
4411       Register Op0 = Inputs[InputUsed[0]];
4412       // If only one input was used, use an undefined vector for the other.
4413       Register Op1 = InputUsed[1] == -1U
4414                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4415                          : Inputs[InputUsed[1]];
4416       // At least one input vector was used. Create a new shuffle vector.
4417       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4418     }
4419 
4420     Ops.clear();
4421   }
4422 
4423   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4424   MI.eraseFromParent();
4425   return Legalized;
4426 }
4427 
4428 static unsigned getScalarOpcForReduction(unsigned Opc) {
4429   unsigned ScalarOpc;
4430   switch (Opc) {
4431   case TargetOpcode::G_VECREDUCE_FADD:
4432     ScalarOpc = TargetOpcode::G_FADD;
4433     break;
4434   case TargetOpcode::G_VECREDUCE_FMUL:
4435     ScalarOpc = TargetOpcode::G_FMUL;
4436     break;
4437   case TargetOpcode::G_VECREDUCE_FMAX:
4438     ScalarOpc = TargetOpcode::G_FMAXNUM;
4439     break;
4440   case TargetOpcode::G_VECREDUCE_FMIN:
4441     ScalarOpc = TargetOpcode::G_FMINNUM;
4442     break;
4443   case TargetOpcode::G_VECREDUCE_ADD:
4444     ScalarOpc = TargetOpcode::G_ADD;
4445     break;
4446   case TargetOpcode::G_VECREDUCE_MUL:
4447     ScalarOpc = TargetOpcode::G_MUL;
4448     break;
4449   case TargetOpcode::G_VECREDUCE_AND:
4450     ScalarOpc = TargetOpcode::G_AND;
4451     break;
4452   case TargetOpcode::G_VECREDUCE_OR:
4453     ScalarOpc = TargetOpcode::G_OR;
4454     break;
4455   case TargetOpcode::G_VECREDUCE_XOR:
4456     ScalarOpc = TargetOpcode::G_XOR;
4457     break;
4458   case TargetOpcode::G_VECREDUCE_SMAX:
4459     ScalarOpc = TargetOpcode::G_SMAX;
4460     break;
4461   case TargetOpcode::G_VECREDUCE_SMIN:
4462     ScalarOpc = TargetOpcode::G_SMIN;
4463     break;
4464   case TargetOpcode::G_VECREDUCE_UMAX:
4465     ScalarOpc = TargetOpcode::G_UMAX;
4466     break;
4467   case TargetOpcode::G_VECREDUCE_UMIN:
4468     ScalarOpc = TargetOpcode::G_UMIN;
4469     break;
4470   default:
4471     llvm_unreachable("Unhandled reduction");
4472   }
4473   return ScalarOpc;
4474 }
4475 
4476 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4477     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4478   unsigned Opc = MI.getOpcode();
4479   assert(Opc != TargetOpcode::G_VECREDUCE_SEQ_FADD &&
4480          Opc != TargetOpcode::G_VECREDUCE_SEQ_FMUL &&
4481          "Sequential reductions not expected");
4482 
4483   if (TypeIdx != 1)
4484     return UnableToLegalize;
4485 
4486   // The semantics of the normal non-sequential reductions allow us to freely
4487   // re-associate the operation.
4488   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
4489 
4490   if (NarrowTy.isVector() &&
4491       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4492     return UnableToLegalize;
4493 
4494   unsigned ScalarOpc = getScalarOpcForReduction(Opc);
4495   SmallVector<Register> SplitSrcs;
4496   // If NarrowTy is a scalar then we're being asked to scalarize.
4497   const unsigned NumParts =
4498       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4499                           : SrcTy.getNumElements();
4500 
4501   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs);
4502   if (NarrowTy.isScalar()) {
4503     if (DstTy != NarrowTy)
4504       return UnableToLegalize; // FIXME: handle implicit extensions.
4505 
4506     if (isPowerOf2_32(NumParts)) {
4507       // Generate a tree of scalar operations to reduce the critical path.
4508       SmallVector<Register> PartialResults;
4509       unsigned NumPartsLeft = NumParts;
4510       while (NumPartsLeft > 1) {
4511         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4512           PartialResults.emplace_back(
4513               MIRBuilder
4514                   .buildInstr(ScalarOpc, {NarrowTy},
4515                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4516                   .getReg(0));
4517         }
4518         SplitSrcs = PartialResults;
4519         PartialResults.clear();
4520         NumPartsLeft = SplitSrcs.size();
4521       }
4522       assert(SplitSrcs.size() == 1);
4523       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4524       MI.eraseFromParent();
4525       return Legalized;
4526     }
4527     // If we can't generate a tree, then just do sequential operations.
4528     Register Acc = SplitSrcs[0];
4529     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4530       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4531                 .getReg(0);
4532     MIRBuilder.buildCopy(DstReg, Acc);
4533     MI.eraseFromParent();
4534     return Legalized;
4535   }
4536   SmallVector<Register> PartialReductions;
4537   for (unsigned Part = 0; Part < NumParts; ++Part) {
4538     PartialReductions.push_back(
4539         MIRBuilder.buildInstr(Opc, {DstTy}, {SplitSrcs[Part]}).getReg(0));
4540   }
4541 
4542 
4543   // If the types involved are powers of 2, we can generate intermediate vector
4544   // ops, before generating a final reduction operation.
4545   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4546       isPowerOf2_32(NarrowTy.getNumElements())) {
4547     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4548   }
4549 
4550   Register Acc = PartialReductions[0];
4551   for (unsigned Part = 1; Part < NumParts; ++Part) {
4552     if (Part == NumParts - 1) {
4553       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4554                             {Acc, PartialReductions[Part]});
4555     } else {
4556       Acc = MIRBuilder
4557                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4558                 .getReg(0);
4559     }
4560   }
4561   MI.eraseFromParent();
4562   return Legalized;
4563 }
4564 
4565 LegalizerHelper::LegalizeResult
4566 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4567                                         LLT SrcTy, LLT NarrowTy,
4568                                         unsigned ScalarOpc) {
4569   SmallVector<Register> SplitSrcs;
4570   // Split the sources into NarrowTy size pieces.
4571   extractParts(SrcReg, NarrowTy,
4572                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs);
4573   // We're going to do a tree reduction using vector operations until we have
4574   // one NarrowTy size value left.
4575   while (SplitSrcs.size() > 1) {
4576     SmallVector<Register> PartialRdxs;
4577     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4578       Register LHS = SplitSrcs[Idx];
4579       Register RHS = SplitSrcs[Idx + 1];
4580       // Create the intermediate vector op.
4581       Register Res =
4582           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4583       PartialRdxs.push_back(Res);
4584     }
4585     SplitSrcs = std::move(PartialRdxs);
4586   }
4587   // Finally generate the requested NarrowTy based reduction.
4588   Observer.changingInstr(MI);
4589   MI.getOperand(1).setReg(SplitSrcs[0]);
4590   Observer.changedInstr(MI);
4591   return Legalized;
4592 }
4593 
4594 LegalizerHelper::LegalizeResult
4595 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4596                                              const LLT HalfTy, const LLT AmtTy) {
4597 
4598   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4599   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4600   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4601 
4602   if (Amt.isZero()) {
4603     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
4604     MI.eraseFromParent();
4605     return Legalized;
4606   }
4607 
4608   LLT NVT = HalfTy;
4609   unsigned NVTBits = HalfTy.getSizeInBits();
4610   unsigned VTBits = 2 * NVTBits;
4611 
4612   SrcOp Lo(Register(0)), Hi(Register(0));
4613   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4614     if (Amt.ugt(VTBits)) {
4615       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4616     } else if (Amt.ugt(NVTBits)) {
4617       Lo = MIRBuilder.buildConstant(NVT, 0);
4618       Hi = MIRBuilder.buildShl(NVT, InL,
4619                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4620     } else if (Amt == NVTBits) {
4621       Lo = MIRBuilder.buildConstant(NVT, 0);
4622       Hi = InL;
4623     } else {
4624       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4625       auto OrLHS =
4626           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4627       auto OrRHS = MIRBuilder.buildLShr(
4628           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4629       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4630     }
4631   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4632     if (Amt.ugt(VTBits)) {
4633       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4634     } else if (Amt.ugt(NVTBits)) {
4635       Lo = MIRBuilder.buildLShr(NVT, InH,
4636                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4637       Hi = MIRBuilder.buildConstant(NVT, 0);
4638     } else if (Amt == NVTBits) {
4639       Lo = InH;
4640       Hi = MIRBuilder.buildConstant(NVT, 0);
4641     } else {
4642       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4643 
4644       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4645       auto OrRHS = MIRBuilder.buildShl(
4646           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4647 
4648       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4649       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4650     }
4651   } else {
4652     if (Amt.ugt(VTBits)) {
4653       Hi = Lo = MIRBuilder.buildAShr(
4654           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4655     } else if (Amt.ugt(NVTBits)) {
4656       Lo = MIRBuilder.buildAShr(NVT, InH,
4657                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4658       Hi = MIRBuilder.buildAShr(NVT, InH,
4659                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4660     } else if (Amt == NVTBits) {
4661       Lo = InH;
4662       Hi = MIRBuilder.buildAShr(NVT, InH,
4663                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
4664     } else {
4665       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4666 
4667       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4668       auto OrRHS = MIRBuilder.buildShl(
4669           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4670 
4671       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4672       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
4673     }
4674   }
4675 
4676   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
4677   MI.eraseFromParent();
4678 
4679   return Legalized;
4680 }
4681 
4682 // TODO: Optimize if constant shift amount.
4683 LegalizerHelper::LegalizeResult
4684 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
4685                                    LLT RequestedTy) {
4686   if (TypeIdx == 1) {
4687     Observer.changingInstr(MI);
4688     narrowScalarSrc(MI, RequestedTy, 2);
4689     Observer.changedInstr(MI);
4690     return Legalized;
4691   }
4692 
4693   Register DstReg = MI.getOperand(0).getReg();
4694   LLT DstTy = MRI.getType(DstReg);
4695   if (DstTy.isVector())
4696     return UnableToLegalize;
4697 
4698   Register Amt = MI.getOperand(2).getReg();
4699   LLT ShiftAmtTy = MRI.getType(Amt);
4700   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
4701   if (DstEltSize % 2 != 0)
4702     return UnableToLegalize;
4703 
4704   // Ignore the input type. We can only go to exactly half the size of the
4705   // input. If that isn't small enough, the resulting pieces will be further
4706   // legalized.
4707   const unsigned NewBitSize = DstEltSize / 2;
4708   const LLT HalfTy = LLT::scalar(NewBitSize);
4709   const LLT CondTy = LLT::scalar(1);
4710 
4711   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
4712     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
4713                                        ShiftAmtTy);
4714   }
4715 
4716   // TODO: Expand with known bits.
4717 
4718   // Handle the fully general expansion by an unknown amount.
4719   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
4720 
4721   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4722   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4723   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4724 
4725   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
4726   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
4727 
4728   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
4729   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
4730   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
4731 
4732   Register ResultRegs[2];
4733   switch (MI.getOpcode()) {
4734   case TargetOpcode::G_SHL: {
4735     // Short: ShAmt < NewBitSize
4736     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
4737 
4738     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
4739     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
4740     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4741 
4742     // Long: ShAmt >= NewBitSize
4743     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
4744     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
4745 
4746     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
4747     auto Hi = MIRBuilder.buildSelect(
4748         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
4749 
4750     ResultRegs[0] = Lo.getReg(0);
4751     ResultRegs[1] = Hi.getReg(0);
4752     break;
4753   }
4754   case TargetOpcode::G_LSHR:
4755   case TargetOpcode::G_ASHR: {
4756     // Short: ShAmt < NewBitSize
4757     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
4758 
4759     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
4760     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
4761     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
4762 
4763     // Long: ShAmt >= NewBitSize
4764     MachineInstrBuilder HiL;
4765     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4766       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
4767     } else {
4768       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
4769       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
4770     }
4771     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
4772                                      {InH, AmtExcess});     // Lo from Hi part.
4773 
4774     auto Lo = MIRBuilder.buildSelect(
4775         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
4776 
4777     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
4778 
4779     ResultRegs[0] = Lo.getReg(0);
4780     ResultRegs[1] = Hi.getReg(0);
4781     break;
4782   }
4783   default:
4784     llvm_unreachable("not a shift");
4785   }
4786 
4787   MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
4788   MI.eraseFromParent();
4789   return Legalized;
4790 }
4791 
4792 LegalizerHelper::LegalizeResult
4793 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
4794                                        LLT MoreTy) {
4795   assert(TypeIdx == 0 && "Expecting only Idx 0");
4796 
4797   Observer.changingInstr(MI);
4798   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
4799     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
4800     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
4801     moreElementsVectorSrc(MI, MoreTy, I);
4802   }
4803 
4804   MachineBasicBlock &MBB = *MI.getParent();
4805   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
4806   moreElementsVectorDst(MI, MoreTy, 0);
4807   Observer.changedInstr(MI);
4808   return Legalized;
4809 }
4810 
4811 LegalizerHelper::LegalizeResult
4812 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
4813                                     LLT MoreTy) {
4814   unsigned Opc = MI.getOpcode();
4815   switch (Opc) {
4816   case TargetOpcode::G_IMPLICIT_DEF:
4817   case TargetOpcode::G_LOAD: {
4818     if (TypeIdx != 0)
4819       return UnableToLegalize;
4820     Observer.changingInstr(MI);
4821     moreElementsVectorDst(MI, MoreTy, 0);
4822     Observer.changedInstr(MI);
4823     return Legalized;
4824   }
4825   case TargetOpcode::G_STORE:
4826     if (TypeIdx != 0)
4827       return UnableToLegalize;
4828     Observer.changingInstr(MI);
4829     moreElementsVectorSrc(MI, MoreTy, 0);
4830     Observer.changedInstr(MI);
4831     return Legalized;
4832   case TargetOpcode::G_AND:
4833   case TargetOpcode::G_OR:
4834   case TargetOpcode::G_XOR:
4835   case TargetOpcode::G_ADD:
4836   case TargetOpcode::G_SUB:
4837   case TargetOpcode::G_MUL:
4838   case TargetOpcode::G_FADD:
4839   case TargetOpcode::G_FMUL:
4840   case TargetOpcode::G_UADDSAT:
4841   case TargetOpcode::G_USUBSAT:
4842   case TargetOpcode::G_SADDSAT:
4843   case TargetOpcode::G_SSUBSAT:
4844   case TargetOpcode::G_SMIN:
4845   case TargetOpcode::G_SMAX:
4846   case TargetOpcode::G_UMIN:
4847   case TargetOpcode::G_UMAX:
4848   case TargetOpcode::G_FMINNUM:
4849   case TargetOpcode::G_FMAXNUM:
4850   case TargetOpcode::G_FMINNUM_IEEE:
4851   case TargetOpcode::G_FMAXNUM_IEEE:
4852   case TargetOpcode::G_FMINIMUM:
4853   case TargetOpcode::G_FMAXIMUM:
4854   case TargetOpcode::G_STRICT_FADD:
4855   case TargetOpcode::G_STRICT_FSUB:
4856   case TargetOpcode::G_STRICT_FMUL: {
4857     Observer.changingInstr(MI);
4858     moreElementsVectorSrc(MI, MoreTy, 1);
4859     moreElementsVectorSrc(MI, MoreTy, 2);
4860     moreElementsVectorDst(MI, MoreTy, 0);
4861     Observer.changedInstr(MI);
4862     return Legalized;
4863   }
4864   case TargetOpcode::G_FMA:
4865   case TargetOpcode::G_STRICT_FMA:
4866   case TargetOpcode::G_FSHR:
4867   case TargetOpcode::G_FSHL: {
4868     Observer.changingInstr(MI);
4869     moreElementsVectorSrc(MI, MoreTy, 1);
4870     moreElementsVectorSrc(MI, MoreTy, 2);
4871     moreElementsVectorSrc(MI, MoreTy, 3);
4872     moreElementsVectorDst(MI, MoreTy, 0);
4873     Observer.changedInstr(MI);
4874     return Legalized;
4875   }
4876   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4877   case TargetOpcode::G_EXTRACT:
4878     if (TypeIdx != 1)
4879       return UnableToLegalize;
4880     Observer.changingInstr(MI);
4881     moreElementsVectorSrc(MI, MoreTy, 1);
4882     Observer.changedInstr(MI);
4883     return Legalized;
4884   case TargetOpcode::G_INSERT:
4885   case TargetOpcode::G_INSERT_VECTOR_ELT:
4886   case TargetOpcode::G_FREEZE:
4887   case TargetOpcode::G_FNEG:
4888   case TargetOpcode::G_FABS:
4889   case TargetOpcode::G_BSWAP:
4890   case TargetOpcode::G_FCANONICALIZE:
4891   case TargetOpcode::G_SEXT_INREG:
4892     if (TypeIdx != 0)
4893       return UnableToLegalize;
4894     Observer.changingInstr(MI);
4895     moreElementsVectorSrc(MI, MoreTy, 1);
4896     moreElementsVectorDst(MI, MoreTy, 0);
4897     Observer.changedInstr(MI);
4898     return Legalized;
4899   case TargetOpcode::G_SELECT: {
4900     auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
4901     if (TypeIdx == 1) {
4902       if (!CondTy.isScalar() ||
4903           DstTy.getElementCount() != MoreTy.getElementCount())
4904         return UnableToLegalize;
4905 
4906       // This is turning a scalar select of vectors into a vector
4907       // select. Broadcast the select condition.
4908       auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
4909       Observer.changingInstr(MI);
4910       MI.getOperand(1).setReg(ShufSplat.getReg(0));
4911       Observer.changedInstr(MI);
4912       return Legalized;
4913     }
4914 
4915     if (CondTy.isVector())
4916       return UnableToLegalize;
4917 
4918     Observer.changingInstr(MI);
4919     moreElementsVectorSrc(MI, MoreTy, 2);
4920     moreElementsVectorSrc(MI, MoreTy, 3);
4921     moreElementsVectorDst(MI, MoreTy, 0);
4922     Observer.changedInstr(MI);
4923     return Legalized;
4924   }
4925   case TargetOpcode::G_UNMERGE_VALUES:
4926     return UnableToLegalize;
4927   case TargetOpcode::G_PHI:
4928     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
4929   case TargetOpcode::G_SHUFFLE_VECTOR:
4930     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
4931   case TargetOpcode::G_BUILD_VECTOR: {
4932     SmallVector<SrcOp, 8> Elts;
4933     for (auto Op : MI.uses()) {
4934       Elts.push_back(Op.getReg());
4935     }
4936 
4937     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
4938       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
4939     }
4940 
4941     MIRBuilder.buildDeleteTrailingVectorElements(
4942         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
4943     MI.eraseFromParent();
4944     return Legalized;
4945   }
4946   case TargetOpcode::G_TRUNC: {
4947     Observer.changingInstr(MI);
4948     moreElementsVectorSrc(MI, MoreTy, 1);
4949     moreElementsVectorDst(MI, MoreTy, 0);
4950     Observer.changedInstr(MI);
4951     return Legalized;
4952   }
4953   case TargetOpcode::G_FPTRUNC:
4954   case TargetOpcode::G_FPEXT: {
4955     if (TypeIdx != 0)
4956       return UnableToLegalize;
4957     Observer.changingInstr(MI);
4958     LLT SrcTy = LLT::fixed_vector(
4959         MoreTy.getNumElements(),
4960         MRI.getType(MI.getOperand(1).getReg()).getElementType());
4961     moreElementsVectorSrc(MI, SrcTy, 1);
4962     moreElementsVectorDst(MI, MoreTy, 0);
4963     Observer.changedInstr(MI);
4964     return Legalized;
4965   }
4966   default:
4967     return UnableToLegalize;
4968   }
4969 }
4970 
4971 LegalizerHelper::LegalizeResult
4972 LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
4973   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
4974   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4975   unsigned MaskNumElts = Mask.size();
4976   unsigned SrcNumElts = SrcTy.getNumElements();
4977   LLT DestEltTy = DstTy.getElementType();
4978 
4979   if (MaskNumElts == SrcNumElts)
4980     return Legalized;
4981 
4982   if (MaskNumElts < SrcNumElts) {
4983     // Extend mask to match new destination vector size with
4984     // undef values.
4985     SmallVector<int, 16> NewMask(Mask);
4986     for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
4987       NewMask.push_back(-1);
4988 
4989     moreElementsVectorDst(MI, SrcTy, 0);
4990     MIRBuilder.setInstrAndDebugLoc(MI);
4991     MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
4992                                   MI.getOperand(1).getReg(),
4993                                   MI.getOperand(2).getReg(), NewMask);
4994     MI.eraseFromParent();
4995 
4996     return Legalized;
4997   }
4998 
4999   unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
5000   unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
5001   LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);
5002 
5003   // Create new source vectors by concatenating the initial
5004   // source vectors with undefined vectors of the same size.
5005   auto Undef = MIRBuilder.buildUndef(SrcTy);
5006   SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
5007   SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
5008   MOps1[0] = MI.getOperand(1).getReg();
5009   MOps2[0] = MI.getOperand(2).getReg();
5010 
5011   auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
5012   auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);
5013 
5014   // Readjust mask for new input vector length.
5015   SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
5016   for (unsigned I = 0; I != MaskNumElts; ++I) {
5017     int Idx = Mask[I];
5018     if (Idx >= static_cast<int>(SrcNumElts))
5019       Idx += PaddedMaskNumElts - SrcNumElts;
5020     MappedOps[I] = Idx;
5021   }
5022 
5023   // If we got more elements than required, extract subvector.
5024   if (MaskNumElts != PaddedMaskNumElts) {
5025     auto Shuffle =
5026         MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);
5027 
5028     SmallVector<Register, 16> Elts(MaskNumElts);
5029     for (unsigned I = 0; I < MaskNumElts; ++I) {
5030       Elts[I] =
5031           MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
5032               .getReg(0);
5033     }
5034     MIRBuilder.buildBuildVector(DstReg, Elts);
5035   } else {
5036     MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
5037   }
5038 
5039   MI.eraseFromParent();
5040   return LegalizerHelper::LegalizeResult::Legalized;
5041 }
5042 
5043 LegalizerHelper::LegalizeResult
5044 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5045                                            unsigned int TypeIdx, LLT MoreTy) {
5046   auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
5047   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5048   unsigned NumElts = DstTy.getNumElements();
5049   unsigned WidenNumElts = MoreTy.getNumElements();
5050 
5051   if (DstTy.isVector() && Src1Ty.isVector() &&
5052       DstTy.getNumElements() != Src1Ty.getNumElements()) {
5053     return equalizeVectorShuffleLengths(MI);
5054   }
5055 
5056   if (TypeIdx != 0)
5057     return UnableToLegalize;
5058 
5059   // Expect a canonicalized shuffle.
5060   if (DstTy != Src1Ty || DstTy != Src2Ty)
5061     return UnableToLegalize;
5062 
5063   moreElementsVectorSrc(MI, MoreTy, 1);
5064   moreElementsVectorSrc(MI, MoreTy, 2);
5065 
5066   // Adjust mask based on new input vector length.
5067   SmallVector<int, 16> NewMask;
5068   for (unsigned I = 0; I != NumElts; ++I) {
5069     int Idx = Mask[I];
5070     if (Idx < static_cast<int>(NumElts))
5071       NewMask.push_back(Idx);
5072     else
5073       NewMask.push_back(Idx - NumElts + WidenNumElts);
5074   }
5075   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5076     NewMask.push_back(-1);
5077   moreElementsVectorDst(MI, MoreTy, 0);
5078   MIRBuilder.setInstrAndDebugLoc(MI);
5079   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5080                                 MI.getOperand(1).getReg(),
5081                                 MI.getOperand(2).getReg(), NewMask);
5082   MI.eraseFromParent();
5083   return Legalized;
5084 }
5085 
5086 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5087                                         ArrayRef<Register> Src1Regs,
5088                                         ArrayRef<Register> Src2Regs,
5089                                         LLT NarrowTy) {
5090   MachineIRBuilder &B = MIRBuilder;
5091   unsigned SrcParts = Src1Regs.size();
5092   unsigned DstParts = DstRegs.size();
5093 
5094   unsigned DstIdx = 0; // Low bits of the result.
5095   Register FactorSum =
5096       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5097   DstRegs[DstIdx] = FactorSum;
5098 
5099   unsigned CarrySumPrevDstIdx;
5100   SmallVector<Register, 4> Factors;
5101 
5102   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5103     // Collect low parts of muls for DstIdx.
5104     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5105          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5106       MachineInstrBuilder Mul =
5107           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5108       Factors.push_back(Mul.getReg(0));
5109     }
5110     // Collect high parts of muls from previous DstIdx.
5111     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5112          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5113       MachineInstrBuilder Umulh =
5114           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5115       Factors.push_back(Umulh.getReg(0));
5116     }
5117     // Add CarrySum from additions calculated for previous DstIdx.
5118     if (DstIdx != 1) {
5119       Factors.push_back(CarrySumPrevDstIdx);
5120     }
5121 
5122     Register CarrySum;
5123     // Add all factors and accumulate all carries into CarrySum.
5124     if (DstIdx != DstParts - 1) {
5125       MachineInstrBuilder Uaddo =
5126           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5127       FactorSum = Uaddo.getReg(0);
5128       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5129       for (unsigned i = 2; i < Factors.size(); ++i) {
5130         MachineInstrBuilder Uaddo =
5131             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5132         FactorSum = Uaddo.getReg(0);
5133         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5134         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5135       }
5136     } else {
5137       // Since value for the next index is not calculated, neither is CarrySum.
5138       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5139       for (unsigned i = 2; i < Factors.size(); ++i)
5140         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5141     }
5142 
5143     CarrySumPrevDstIdx = CarrySum;
5144     DstRegs[DstIdx] = FactorSum;
5145     Factors.clear();
5146   }
5147 }
5148 
5149 LegalizerHelper::LegalizeResult
5150 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5151                                     LLT NarrowTy) {
5152   if (TypeIdx != 0)
5153     return UnableToLegalize;
5154 
5155   Register DstReg = MI.getOperand(0).getReg();
5156   LLT DstType = MRI.getType(DstReg);
5157   // FIXME: add support for vector types
5158   if (DstType.isVector())
5159     return UnableToLegalize;
5160 
5161   unsigned Opcode = MI.getOpcode();
5162   unsigned OpO, OpE, OpF;
5163   switch (Opcode) {
5164   case TargetOpcode::G_SADDO:
5165   case TargetOpcode::G_SADDE:
5166   case TargetOpcode::G_UADDO:
5167   case TargetOpcode::G_UADDE:
5168   case TargetOpcode::G_ADD:
5169     OpO = TargetOpcode::G_UADDO;
5170     OpE = TargetOpcode::G_UADDE;
5171     OpF = TargetOpcode::G_UADDE;
5172     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5173       OpF = TargetOpcode::G_SADDE;
5174     break;
5175   case TargetOpcode::G_SSUBO:
5176   case TargetOpcode::G_SSUBE:
5177   case TargetOpcode::G_USUBO:
5178   case TargetOpcode::G_USUBE:
5179   case TargetOpcode::G_SUB:
5180     OpO = TargetOpcode::G_USUBO;
5181     OpE = TargetOpcode::G_USUBE;
5182     OpF = TargetOpcode::G_USUBE;
5183     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5184       OpF = TargetOpcode::G_SSUBE;
5185     break;
5186   default:
5187     llvm_unreachable("Unexpected add/sub opcode!");
5188   }
5189 
5190   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5191   unsigned NumDefs = MI.getNumExplicitDefs();
5192   Register Src1 = MI.getOperand(NumDefs).getReg();
5193   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5194   Register CarryDst, CarryIn;
5195   if (NumDefs == 2)
5196     CarryDst = MI.getOperand(1).getReg();
5197   if (MI.getNumOperands() == NumDefs + 3)
5198     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5199 
5200   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5201   LLT LeftoverTy, DummyTy;
5202   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5203   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left);
5204   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left);
5205 
5206   int NarrowParts = Src1Regs.size();
5207   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5208     Src1Regs.push_back(Src1Left[I]);
5209     Src2Regs.push_back(Src2Left[I]);
5210   }
5211   DstRegs.reserve(Src1Regs.size());
5212 
5213   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5214     Register DstReg =
5215         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5216     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5217     // Forward the final carry-out to the destination register
5218     if (i == e - 1 && CarryDst)
5219       CarryOut = CarryDst;
5220 
5221     if (!CarryIn) {
5222       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5223                             {Src1Regs[i], Src2Regs[i]});
5224     } else if (i == e - 1) {
5225       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5226                             {Src1Regs[i], Src2Regs[i], CarryIn});
5227     } else {
5228       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5229                             {Src1Regs[i], Src2Regs[i], CarryIn});
5230     }
5231 
5232     DstRegs.push_back(DstReg);
5233     CarryIn = CarryOut;
5234   }
5235   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5236               ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5237               ArrayRef(DstRegs).drop_front(NarrowParts));
5238 
5239   MI.eraseFromParent();
5240   return Legalized;
5241 }
5242 
5243 LegalizerHelper::LegalizeResult
5244 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5245   auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
5246 
5247   LLT Ty = MRI.getType(DstReg);
5248   if (Ty.isVector())
5249     return UnableToLegalize;
5250 
5251   unsigned Size = Ty.getSizeInBits();
5252   unsigned NarrowSize = NarrowTy.getSizeInBits();
5253   if (Size % NarrowSize != 0)
5254     return UnableToLegalize;
5255 
5256   unsigned NumParts = Size / NarrowSize;
5257   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5258   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5259 
5260   SmallVector<Register, 2> Src1Parts, Src2Parts;
5261   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5262   extractParts(Src1, NarrowTy, NumParts, Src1Parts);
5263   extractParts(Src2, NarrowTy, NumParts, Src2Parts);
5264   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5265 
5266   // Take only high half of registers if this is high mul.
5267   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5268   MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5269   MI.eraseFromParent();
5270   return Legalized;
5271 }
5272 
5273 LegalizerHelper::LegalizeResult
5274 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5275                                    LLT NarrowTy) {
5276   if (TypeIdx != 0)
5277     return UnableToLegalize;
5278 
5279   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5280 
5281   Register Src = MI.getOperand(1).getReg();
5282   LLT SrcTy = MRI.getType(Src);
5283 
5284   // If all finite floats fit into the narrowed integer type, we can just swap
5285   // out the result type. This is practically only useful for conversions from
5286   // half to at least 16-bits, so just handle the one case.
5287   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5288       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5289     return UnableToLegalize;
5290 
5291   Observer.changingInstr(MI);
5292   narrowScalarDst(MI, NarrowTy, 0,
5293                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5294   Observer.changedInstr(MI);
5295   return Legalized;
5296 }
5297 
5298 LegalizerHelper::LegalizeResult
5299 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5300                                      LLT NarrowTy) {
5301   if (TypeIdx != 1)
5302     return UnableToLegalize;
5303 
5304   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5305 
5306   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5307   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5308   // NarrowSize.
5309   if (SizeOp1 % NarrowSize != 0)
5310     return UnableToLegalize;
5311   int NumParts = SizeOp1 / NarrowSize;
5312 
5313   SmallVector<Register, 2> SrcRegs, DstRegs;
5314   SmallVector<uint64_t, 2> Indexes;
5315   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
5316 
5317   Register OpReg = MI.getOperand(0).getReg();
5318   uint64_t OpStart = MI.getOperand(2).getImm();
5319   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5320   for (int i = 0; i < NumParts; ++i) {
5321     unsigned SrcStart = i * NarrowSize;
5322 
5323     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5324       // No part of the extract uses this subregister, ignore it.
5325       continue;
5326     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5327       // The entire subregister is extracted, forward the value.
5328       DstRegs.push_back(SrcRegs[i]);
5329       continue;
5330     }
5331 
5332     // OpSegStart is where this destination segment would start in OpReg if it
5333     // extended infinitely in both directions.
5334     int64_t ExtractOffset;
5335     uint64_t SegSize;
5336     if (OpStart < SrcStart) {
5337       ExtractOffset = 0;
5338       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5339     } else {
5340       ExtractOffset = OpStart - SrcStart;
5341       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5342     }
5343 
5344     Register SegReg = SrcRegs[i];
5345     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5346       // A genuine extract is needed.
5347       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5348       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5349     }
5350 
5351     DstRegs.push_back(SegReg);
5352   }
5353 
5354   Register DstReg = MI.getOperand(0).getReg();
5355   if (MRI.getType(DstReg).isVector())
5356     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5357   else if (DstRegs.size() > 1)
5358     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5359   else
5360     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5361   MI.eraseFromParent();
5362   return Legalized;
5363 }
5364 
5365 LegalizerHelper::LegalizeResult
5366 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5367                                     LLT NarrowTy) {
5368   // FIXME: Don't know how to handle secondary types yet.
5369   if (TypeIdx != 0)
5370     return UnableToLegalize;
5371 
5372   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5373   SmallVector<uint64_t, 2> Indexes;
5374   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5375   LLT LeftoverTy;
5376   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5377                LeftoverRegs);
5378 
5379   for (Register Reg : LeftoverRegs)
5380     SrcRegs.push_back(Reg);
5381 
5382   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5383   Register OpReg = MI.getOperand(2).getReg();
5384   uint64_t OpStart = MI.getOperand(3).getImm();
5385   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5386   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5387     unsigned DstStart = I * NarrowSize;
5388 
5389     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5390       // The entire subregister is defined by this insert, forward the new
5391       // value.
5392       DstRegs.push_back(OpReg);
5393       continue;
5394     }
5395 
5396     Register SrcReg = SrcRegs[I];
5397     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5398       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5399       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5400       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5401     }
5402 
5403     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5404       // No part of the insert affects this subregister, forward the original.
5405       DstRegs.push_back(SrcReg);
5406       continue;
5407     }
5408 
5409     // OpSegStart is where this destination segment would start in OpReg if it
5410     // extended infinitely in both directions.
5411     int64_t ExtractOffset, InsertOffset;
5412     uint64_t SegSize;
5413     if (OpStart < DstStart) {
5414       InsertOffset = 0;
5415       ExtractOffset = DstStart - OpStart;
5416       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5417     } else {
5418       InsertOffset = OpStart - DstStart;
5419       ExtractOffset = 0;
5420       SegSize =
5421         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5422     }
5423 
5424     Register SegReg = OpReg;
5425     if (ExtractOffset != 0 || SegSize != OpSize) {
5426       // A genuine extract is needed.
5427       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5428       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5429     }
5430 
5431     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5432     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5433     DstRegs.push_back(DstReg);
5434   }
5435 
5436   uint64_t WideSize = DstRegs.size() * NarrowSize;
5437   Register DstReg = MI.getOperand(0).getReg();
5438   if (WideSize > RegTy.getSizeInBits()) {
5439     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5440     MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
5441     MIRBuilder.buildTrunc(DstReg, MergeReg);
5442   } else
5443     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5444 
5445   MI.eraseFromParent();
5446   return Legalized;
5447 }
5448 
5449 LegalizerHelper::LegalizeResult
5450 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5451                                    LLT NarrowTy) {
5452   Register DstReg = MI.getOperand(0).getReg();
5453   LLT DstTy = MRI.getType(DstReg);
5454 
5455   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5456 
5457   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5458   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5459   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5460   LLT LeftoverTy;
5461   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5462                     Src0Regs, Src0LeftoverRegs))
5463     return UnableToLegalize;
5464 
5465   LLT Unused;
5466   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5467                     Src1Regs, Src1LeftoverRegs))
5468     llvm_unreachable("inconsistent extractParts result");
5469 
5470   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5471     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5472                                         {Src0Regs[I], Src1Regs[I]});
5473     DstRegs.push_back(Inst.getReg(0));
5474   }
5475 
5476   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5477     auto Inst = MIRBuilder.buildInstr(
5478       MI.getOpcode(),
5479       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5480     DstLeftoverRegs.push_back(Inst.getReg(0));
5481   }
5482 
5483   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5484               LeftoverTy, DstLeftoverRegs);
5485 
5486   MI.eraseFromParent();
5487   return Legalized;
5488 }
5489 
5490 LegalizerHelper::LegalizeResult
5491 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5492                                  LLT NarrowTy) {
5493   if (TypeIdx != 0)
5494     return UnableToLegalize;
5495 
5496   auto [DstReg, SrcReg] = MI.getFirst2Regs();
5497 
5498   LLT DstTy = MRI.getType(DstReg);
5499   if (DstTy.isVector())
5500     return UnableToLegalize;
5501 
5502   SmallVector<Register, 8> Parts;
5503   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5504   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5505   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5506 
5507   MI.eraseFromParent();
5508   return Legalized;
5509 }
5510 
5511 LegalizerHelper::LegalizeResult
5512 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5513                                     LLT NarrowTy) {
5514   if (TypeIdx != 0)
5515     return UnableToLegalize;
5516 
5517   Register CondReg = MI.getOperand(1).getReg();
5518   LLT CondTy = MRI.getType(CondReg);
5519   if (CondTy.isVector()) // TODO: Handle vselect
5520     return UnableToLegalize;
5521 
5522   Register DstReg = MI.getOperand(0).getReg();
5523   LLT DstTy = MRI.getType(DstReg);
5524 
5525   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5526   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5527   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5528   LLT LeftoverTy;
5529   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5530                     Src1Regs, Src1LeftoverRegs))
5531     return UnableToLegalize;
5532 
5533   LLT Unused;
5534   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5535                     Src2Regs, Src2LeftoverRegs))
5536     llvm_unreachable("inconsistent extractParts result");
5537 
5538   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5539     auto Select = MIRBuilder.buildSelect(NarrowTy,
5540                                          CondReg, Src1Regs[I], Src2Regs[I]);
5541     DstRegs.push_back(Select.getReg(0));
5542   }
5543 
5544   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5545     auto Select = MIRBuilder.buildSelect(
5546       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5547     DstLeftoverRegs.push_back(Select.getReg(0));
5548   }
5549 
5550   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5551               LeftoverTy, DstLeftoverRegs);
5552 
5553   MI.eraseFromParent();
5554   return Legalized;
5555 }
5556 
5557 LegalizerHelper::LegalizeResult
5558 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5559                                   LLT NarrowTy) {
5560   if (TypeIdx != 1)
5561     return UnableToLegalize;
5562 
5563   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5564   unsigned NarrowSize = NarrowTy.getSizeInBits();
5565 
5566   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5567     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5568 
5569     MachineIRBuilder &B = MIRBuilder;
5570     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5571     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5572     auto C_0 = B.buildConstant(NarrowTy, 0);
5573     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5574                                 UnmergeSrc.getReg(1), C_0);
5575     auto LoCTLZ = IsUndef ?
5576       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5577       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5578     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5579     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5580     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5581     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5582 
5583     MI.eraseFromParent();
5584     return Legalized;
5585   }
5586 
5587   return UnableToLegalize;
5588 }
5589 
5590 LegalizerHelper::LegalizeResult
5591 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5592                                   LLT NarrowTy) {
5593   if (TypeIdx != 1)
5594     return UnableToLegalize;
5595 
5596   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5597   unsigned NarrowSize = NarrowTy.getSizeInBits();
5598 
5599   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5600     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5601 
5602     MachineIRBuilder &B = MIRBuilder;
5603     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5604     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5605     auto C_0 = B.buildConstant(NarrowTy, 0);
5606     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5607                                 UnmergeSrc.getReg(0), C_0);
5608     auto HiCTTZ = IsUndef ?
5609       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5610       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5611     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5612     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5613     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5614     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5615 
5616     MI.eraseFromParent();
5617     return Legalized;
5618   }
5619 
5620   return UnableToLegalize;
5621 }
5622 
5623 LegalizerHelper::LegalizeResult
5624 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5625                                    LLT NarrowTy) {
5626   if (TypeIdx != 1)
5627     return UnableToLegalize;
5628 
5629   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5630   unsigned NarrowSize = NarrowTy.getSizeInBits();
5631 
5632   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5633     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
5634 
5635     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
5636     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
5637     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
5638 
5639     MI.eraseFromParent();
5640     return Legalized;
5641   }
5642 
5643   return UnableToLegalize;
5644 }
5645 
5646 LegalizerHelper::LegalizeResult
5647 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
5648                                     LLT NarrowTy) {
5649   if (TypeIdx != 1)
5650     return UnableToLegalize;
5651 
5652   MachineIRBuilder &B = MIRBuilder;
5653   Register ExpReg = MI.getOperand(2).getReg();
5654   LLT ExpTy = MRI.getType(ExpReg);
5655 
5656   unsigned ClampSize = NarrowTy.getScalarSizeInBits();
5657 
5658   // Clamp the exponent to the range of the target type.
5659   auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
5660   auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
5661   auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
5662   auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
5663 
5664   auto Trunc = B.buildTrunc(NarrowTy, Clamp);
5665   Observer.changingInstr(MI);
5666   MI.getOperand(2).setReg(Trunc.getReg(0));
5667   Observer.changedInstr(MI);
5668   return Legalized;
5669 }
5670 
5671 LegalizerHelper::LegalizeResult
5672 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
5673   unsigned Opc = MI.getOpcode();
5674   const auto &TII = MIRBuilder.getTII();
5675   auto isSupported = [this](const LegalityQuery &Q) {
5676     auto QAction = LI.getAction(Q).Action;
5677     return QAction == Legal || QAction == Libcall || QAction == Custom;
5678   };
5679   switch (Opc) {
5680   default:
5681     return UnableToLegalize;
5682   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
5683     // This trivially expands to CTLZ.
5684     Observer.changingInstr(MI);
5685     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
5686     Observer.changedInstr(MI);
5687     return Legalized;
5688   }
5689   case TargetOpcode::G_CTLZ: {
5690     auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5691     unsigned Len = SrcTy.getSizeInBits();
5692 
5693     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5694       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
5695       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
5696       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
5697       auto ICmp = MIRBuilder.buildICmp(
5698           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
5699       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5700       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
5701       MI.eraseFromParent();
5702       return Legalized;
5703     }
5704     // for now, we do this:
5705     // NewLen = NextPowerOf2(Len);
5706     // x = x | (x >> 1);
5707     // x = x | (x >> 2);
5708     // ...
5709     // x = x | (x >>16);
5710     // x = x | (x >>32); // for 64-bit input
5711     // Upto NewLen/2
5712     // return Len - popcount(x);
5713     //
5714     // Ref: "Hacker's Delight" by Henry Warren
5715     Register Op = SrcReg;
5716     unsigned NewLen = PowerOf2Ceil(Len);
5717     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
5718       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
5719       auto MIBOp = MIRBuilder.buildOr(
5720           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
5721       Op = MIBOp.getReg(0);
5722     }
5723     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
5724     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
5725                         MIBPop);
5726     MI.eraseFromParent();
5727     return Legalized;
5728   }
5729   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
5730     // This trivially expands to CTTZ.
5731     Observer.changingInstr(MI);
5732     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
5733     Observer.changedInstr(MI);
5734     return Legalized;
5735   }
5736   case TargetOpcode::G_CTTZ: {
5737     auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5738 
5739     unsigned Len = SrcTy.getSizeInBits();
5740     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
5741       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
5742       // zero.
5743       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
5744       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
5745       auto ICmp = MIRBuilder.buildICmp(
5746           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
5747       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
5748       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
5749       MI.eraseFromParent();
5750       return Legalized;
5751     }
5752     // for now, we use: { return popcount(~x & (x - 1)); }
5753     // unless the target has ctlz but not ctpop, in which case we use:
5754     // { return 32 - nlz(~x & (x-1)); }
5755     // Ref: "Hacker's Delight" by Henry Warren
5756     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
5757     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
5758     auto MIBTmp = MIRBuilder.buildAnd(
5759         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
5760     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
5761         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
5762       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
5763       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
5764                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
5765       MI.eraseFromParent();
5766       return Legalized;
5767     }
5768     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
5769     MI.getOperand(1).setReg(MIBTmp.getReg(0));
5770     return Legalized;
5771   }
5772   case TargetOpcode::G_CTPOP: {
5773     Register SrcReg = MI.getOperand(1).getReg();
5774     LLT Ty = MRI.getType(SrcReg);
5775     unsigned Size = Ty.getSizeInBits();
5776     MachineIRBuilder &B = MIRBuilder;
5777 
5778     // Count set bits in blocks of 2 bits. Default approach would be
5779     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
5780     // We use following formula instead:
5781     // B2Count = val - { (val >> 1) & 0x55555555 }
5782     // since it gives same result in blocks of 2 with one instruction less.
5783     auto C_1 = B.buildConstant(Ty, 1);
5784     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
5785     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
5786     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
5787     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
5788     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
5789 
5790     // In order to get count in blocks of 4 add values from adjacent block of 2.
5791     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
5792     auto C_2 = B.buildConstant(Ty, 2);
5793     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
5794     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
5795     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
5796     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
5797     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
5798     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
5799 
5800     // For count in blocks of 8 bits we don't have to mask high 4 bits before
5801     // addition since count value sits in range {0,...,8} and 4 bits are enough
5802     // to hold such binary values. After addition high 4 bits still hold count
5803     // of set bits in high 4 bit block, set them to zero and get 8 bit result.
5804     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
5805     auto C_4 = B.buildConstant(Ty, 4);
5806     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
5807     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
5808     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
5809     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
5810     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
5811 
5812     assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
5813     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
5814     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
5815     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
5816     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
5817 
5818     // Shift count result from 8 high bits to low bits.
5819     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
5820     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
5821 
5822     MI.eraseFromParent();
5823     return Legalized;
5824   }
5825   }
5826 }
5827 
5828 // Check that (every element of) Reg is undef or not an exact multiple of BW.
5829 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
5830                                         Register Reg, unsigned BW) {
5831   return matchUnaryPredicate(
5832       MRI, Reg,
5833       [=](const Constant *C) {
5834         // Null constant here means an undef.
5835         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
5836         return !CI || CI->getValue().urem(BW) != 0;
5837       },
5838       /*AllowUndefs*/ true);
5839 }
5840 
5841 LegalizerHelper::LegalizeResult
5842 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
5843   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
5844   LLT Ty = MRI.getType(Dst);
5845   LLT ShTy = MRI.getType(Z);
5846 
5847   unsigned BW = Ty.getScalarSizeInBits();
5848 
5849   if (!isPowerOf2_32(BW))
5850     return UnableToLegalize;
5851 
5852   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5853   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5854 
5855   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5856     // fshl X, Y, Z -> fshr X, Y, -Z
5857     // fshr X, Y, Z -> fshl X, Y, -Z
5858     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
5859     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
5860   } else {
5861     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
5862     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
5863     auto One = MIRBuilder.buildConstant(ShTy, 1);
5864     if (IsFSHL) {
5865       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5866       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
5867     } else {
5868       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
5869       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
5870     }
5871 
5872     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
5873   }
5874 
5875   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
5876   MI.eraseFromParent();
5877   return Legalized;
5878 }
5879 
5880 LegalizerHelper::LegalizeResult
5881 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
5882   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
5883   LLT Ty = MRI.getType(Dst);
5884   LLT ShTy = MRI.getType(Z);
5885 
5886   const unsigned BW = Ty.getScalarSizeInBits();
5887   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5888 
5889   Register ShX, ShY;
5890   Register ShAmt, InvShAmt;
5891 
5892   // FIXME: Emit optimized urem by constant instead of letting it expand later.
5893   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
5894     // fshl: X << C | Y >> (BW - C)
5895     // fshr: X << (BW - C) | Y >> C
5896     // where C = Z % BW is not zero
5897     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5898     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5899     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
5900     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
5901     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
5902   } else {
5903     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
5904     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
5905     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
5906     if (isPowerOf2_32(BW)) {
5907       // Z % BW -> Z & (BW - 1)
5908       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
5909       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
5910       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
5911       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
5912     } else {
5913       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
5914       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
5915       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
5916     }
5917 
5918     auto One = MIRBuilder.buildConstant(ShTy, 1);
5919     if (IsFSHL) {
5920       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
5921       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
5922       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
5923     } else {
5924       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
5925       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
5926       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
5927     }
5928   }
5929 
5930   MIRBuilder.buildOr(Dst, ShX, ShY);
5931   MI.eraseFromParent();
5932   return Legalized;
5933 }
5934 
5935 LegalizerHelper::LegalizeResult
5936 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
5937   // These operations approximately do the following (while avoiding undefined
5938   // shifts by BW):
5939   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
5940   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
5941   Register Dst = MI.getOperand(0).getReg();
5942   LLT Ty = MRI.getType(Dst);
5943   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
5944 
5945   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
5946   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
5947 
5948   // TODO: Use smarter heuristic that accounts for vector legalization.
5949   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
5950     return lowerFunnelShiftAsShifts(MI);
5951 
5952   // This only works for powers of 2, fallback to shifts if it fails.
5953   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
5954   if (Result == UnableToLegalize)
5955     return lowerFunnelShiftAsShifts(MI);
5956   return Result;
5957 }
5958 
5959 LegalizerHelper::LegalizeResult
5960 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
5961   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
5962   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
5963   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5964   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5965   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
5966   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
5967   MI.eraseFromParent();
5968   return Legalized;
5969 }
5970 
5971 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
5972   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
5973 
5974   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
5975   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
5976 
5977   MIRBuilder.setInstrAndDebugLoc(MI);
5978 
5979   // If a rotate in the other direction is supported, use it.
5980   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
5981   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
5982       isPowerOf2_32(EltSizeInBits))
5983     return lowerRotateWithReverseRotate(MI);
5984 
5985   // If a funnel shift is supported, use it.
5986   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5987   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
5988   bool IsFShLegal = false;
5989   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
5990       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
5991     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
5992                                 Register R3) {
5993       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
5994       MI.eraseFromParent();
5995       return Legalized;
5996     };
5997     // If a funnel shift in the other direction is supported, use it.
5998     if (IsFShLegal) {
5999       return buildFunnelShift(FShOpc, Dst, Src, Amt);
6000     } else if (isPowerOf2_32(EltSizeInBits)) {
6001       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
6002       return buildFunnelShift(RevFsh, Dst, Src, Amt);
6003     }
6004   }
6005 
6006   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6007   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6008   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6009   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6010   Register ShVal;
6011   Register RevShiftVal;
6012   if (isPowerOf2_32(EltSizeInBits)) {
6013     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6014     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6015     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6016     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6017     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6018     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6019     RevShiftVal =
6020         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6021   } else {
6022     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6023     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6024     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6025     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6026     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6027     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6028     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6029     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6030     RevShiftVal =
6031         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6032   }
6033   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6034   MI.eraseFromParent();
6035   return Legalized;
6036 }
6037 
6038 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6039 // representation.
6040 LegalizerHelper::LegalizeResult
6041 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6042   auto [Dst, Src] = MI.getFirst2Regs();
6043   const LLT S64 = LLT::scalar(64);
6044   const LLT S32 = LLT::scalar(32);
6045   const LLT S1 = LLT::scalar(1);
6046 
6047   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6048 
6049   // unsigned cul2f(ulong u) {
6050   //   uint lz = clz(u);
6051   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6052   //   u = (u << lz) & 0x7fffffffffffffffUL;
6053   //   ulong t = u & 0xffffffffffUL;
6054   //   uint v = (e << 23) | (uint)(u >> 40);
6055   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6056   //   return as_float(v + r);
6057   // }
6058 
6059   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6060   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6061 
6062   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6063 
6064   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6065   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6066 
6067   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6068   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6069 
6070   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6071   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6072 
6073   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6074 
6075   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6076   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6077 
6078   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
6079   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
6080   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
6081 
6082   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6083   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6084   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6085   auto One = MIRBuilder.buildConstant(S32, 1);
6086 
6087   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6088   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6089   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6090   MIRBuilder.buildAdd(Dst, V, R);
6091 
6092   MI.eraseFromParent();
6093   return Legalized;
6094 }
6095 
6096 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6097   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6098 
6099   if (SrcTy == LLT::scalar(1)) {
6100     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6101     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6102     MIRBuilder.buildSelect(Dst, Src, True, False);
6103     MI.eraseFromParent();
6104     return Legalized;
6105   }
6106 
6107   if (SrcTy != LLT::scalar(64))
6108     return UnableToLegalize;
6109 
6110   if (DstTy == LLT::scalar(32)) {
6111     // TODO: SelectionDAG has several alternative expansions to port which may
6112     // be more reasonble depending on the available instructions. If a target
6113     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6114     // intermediate type, this is probably worse.
6115     return lowerU64ToF32BitOps(MI);
6116   }
6117 
6118   return UnableToLegalize;
6119 }
6120 
6121 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6122   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6123 
6124   const LLT S64 = LLT::scalar(64);
6125   const LLT S32 = LLT::scalar(32);
6126   const LLT S1 = LLT::scalar(1);
6127 
6128   if (SrcTy == S1) {
6129     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6130     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6131     MIRBuilder.buildSelect(Dst, Src, True, False);
6132     MI.eraseFromParent();
6133     return Legalized;
6134   }
6135 
6136   if (SrcTy != S64)
6137     return UnableToLegalize;
6138 
6139   if (DstTy == S32) {
6140     // signed cl2f(long l) {
6141     //   long s = l >> 63;
6142     //   float r = cul2f((l + s) ^ s);
6143     //   return s ? -r : r;
6144     // }
6145     Register L = Src;
6146     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6147     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6148 
6149     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6150     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6151     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6152 
6153     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6154     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6155                                             MIRBuilder.buildConstant(S64, 0));
6156     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6157     MI.eraseFromParent();
6158     return Legalized;
6159   }
6160 
6161   return UnableToLegalize;
6162 }
6163 
6164 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6165   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6166   const LLT S64 = LLT::scalar(64);
6167   const LLT S32 = LLT::scalar(32);
6168 
6169   if (SrcTy != S64 && SrcTy != S32)
6170     return UnableToLegalize;
6171   if (DstTy != S32 && DstTy != S64)
6172     return UnableToLegalize;
6173 
6174   // FPTOSI gives same result as FPTOUI for positive signed integers.
6175   // FPTOUI needs to deal with fp values that convert to unsigned integers
6176   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6177 
6178   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6179   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6180                                                 : APFloat::IEEEdouble(),
6181                     APInt::getZero(SrcTy.getSizeInBits()));
6182   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6183 
6184   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6185 
6186   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6187   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6188   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6189   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6190   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6191   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6192   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6193 
6194   const LLT S1 = LLT::scalar(1);
6195 
6196   MachineInstrBuilder FCMP =
6197       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6198   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6199 
6200   MI.eraseFromParent();
6201   return Legalized;
6202 }
6203 
6204 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6205   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6206   const LLT S64 = LLT::scalar(64);
6207   const LLT S32 = LLT::scalar(32);
6208 
6209   // FIXME: Only f32 to i64 conversions are supported.
6210   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6211     return UnableToLegalize;
6212 
6213   // Expand f32 -> i64 conversion
6214   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6215   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6216 
6217   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6218 
6219   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6220   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6221 
6222   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6223   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6224 
6225   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6226                                            APInt::getSignMask(SrcEltBits));
6227   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6228   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6229   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6230   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6231 
6232   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6233   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6234   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6235 
6236   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6237   R = MIRBuilder.buildZExt(DstTy, R);
6238 
6239   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6240   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6241   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6242   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6243 
6244   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6245   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6246 
6247   const LLT S1 = LLT::scalar(1);
6248   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6249                                     S1, Exponent, ExponentLoBit);
6250 
6251   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6252 
6253   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6254   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6255 
6256   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6257 
6258   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6259                                           S1, Exponent, ZeroSrcTy);
6260 
6261   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6262   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6263 
6264   MI.eraseFromParent();
6265   return Legalized;
6266 }
6267 
6268 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6269 LegalizerHelper::LegalizeResult
6270 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6271   const LLT S1 = LLT::scalar(1);
6272   const LLT S32 = LLT::scalar(32);
6273 
6274   auto [Dst, Src] = MI.getFirst2Regs();
6275   assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
6276          MRI.getType(Src).getScalarType() == LLT::scalar(64));
6277 
6278   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6279     return UnableToLegalize;
6280 
6281   if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
6282     unsigned Flags = MI.getFlags();
6283     auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
6284     MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
6285     MI.eraseFromParent();
6286     return Legalized;
6287   }
6288 
6289   const unsigned ExpMask = 0x7ff;
6290   const unsigned ExpBiasf64 = 1023;
6291   const unsigned ExpBiasf16 = 15;
6292 
6293   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6294   Register U = Unmerge.getReg(0);
6295   Register UH = Unmerge.getReg(1);
6296 
6297   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6298   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6299 
6300   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6301   // add the f16 bias (15) to get the biased exponent for the f16 format.
6302   E = MIRBuilder.buildAdd(
6303     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6304 
6305   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6306   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6307 
6308   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6309                                        MIRBuilder.buildConstant(S32, 0x1ff));
6310   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6311 
6312   auto Zero = MIRBuilder.buildConstant(S32, 0);
6313   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6314   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6315   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6316 
6317   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6318   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6319   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6320   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6321 
6322   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6323   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6324 
6325   // N = M | (E << 12);
6326   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6327   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6328 
6329   // B = clamp(1-E, 0, 13);
6330   auto One = MIRBuilder.buildConstant(S32, 1);
6331   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6332   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6333   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6334 
6335   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6336                                        MIRBuilder.buildConstant(S32, 0x1000));
6337 
6338   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6339   auto D0 = MIRBuilder.buildShl(S32, D, B);
6340 
6341   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6342                                              D0, SigSetHigh);
6343   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6344   D = MIRBuilder.buildOr(S32, D, D1);
6345 
6346   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6347   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6348 
6349   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6350   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6351 
6352   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6353                                        MIRBuilder.buildConstant(S32, 3));
6354   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6355 
6356   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6357                                        MIRBuilder.buildConstant(S32, 5));
6358   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6359 
6360   V1 = MIRBuilder.buildOr(S32, V0, V1);
6361   V = MIRBuilder.buildAdd(S32, V, V1);
6362 
6363   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6364                                        E, MIRBuilder.buildConstant(S32, 30));
6365   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6366                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6367 
6368   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6369                                          E, MIRBuilder.buildConstant(S32, 1039));
6370   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6371 
6372   // Extract the sign bit.
6373   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6374   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6375 
6376   // Insert the sign bit
6377   V = MIRBuilder.buildOr(S32, Sign, V);
6378 
6379   MIRBuilder.buildTrunc(Dst, V);
6380   MI.eraseFromParent();
6381   return Legalized;
6382 }
6383 
6384 LegalizerHelper::LegalizeResult
6385 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6386   auto [DstTy, SrcTy] = MI.getFirst2LLTs();
6387   const LLT S64 = LLT::scalar(64);
6388   const LLT S16 = LLT::scalar(16);
6389 
6390   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6391     return lowerFPTRUNC_F64_TO_F16(MI);
6392 
6393   return UnableToLegalize;
6394 }
6395 
6396 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6397 // multiplication tree.
6398 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6399   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6400   LLT Ty = MRI.getType(Dst);
6401 
6402   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6403   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6404   MI.eraseFromParent();
6405   return Legalized;
6406 }
6407 
6408 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6409   switch (Opc) {
6410   case TargetOpcode::G_SMIN:
6411     return CmpInst::ICMP_SLT;
6412   case TargetOpcode::G_SMAX:
6413     return CmpInst::ICMP_SGT;
6414   case TargetOpcode::G_UMIN:
6415     return CmpInst::ICMP_ULT;
6416   case TargetOpcode::G_UMAX:
6417     return CmpInst::ICMP_UGT;
6418   default:
6419     llvm_unreachable("not in integer min/max");
6420   }
6421 }
6422 
6423 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6424   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6425 
6426   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6427   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6428 
6429   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6430   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6431 
6432   MI.eraseFromParent();
6433   return Legalized;
6434 }
6435 
6436 LegalizerHelper::LegalizeResult
6437 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6438   auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
6439   const int Src0Size = Src0Ty.getScalarSizeInBits();
6440   const int Src1Size = Src1Ty.getScalarSizeInBits();
6441 
6442   auto SignBitMask = MIRBuilder.buildConstant(
6443     Src0Ty, APInt::getSignMask(Src0Size));
6444 
6445   auto NotSignBitMask = MIRBuilder.buildConstant(
6446     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6447 
6448   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6449   Register And1;
6450   if (Src0Ty == Src1Ty) {
6451     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6452   } else if (Src0Size > Src1Size) {
6453     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6454     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6455     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6456     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6457   } else {
6458     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6459     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6460     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6461     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6462   }
6463 
6464   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6465   // constants are a nan and -0.0, but the final result should preserve
6466   // everything.
6467   unsigned Flags = MI.getFlags();
6468   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6469 
6470   MI.eraseFromParent();
6471   return Legalized;
6472 }
6473 
6474 LegalizerHelper::LegalizeResult
6475 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6476   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6477     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6478 
6479   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6480   LLT Ty = MRI.getType(Dst);
6481 
6482   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6483     // Insert canonicalizes if it's possible we need to quiet to get correct
6484     // sNaN behavior.
6485 
6486     // Note this must be done here, and not as an optimization combine in the
6487     // absence of a dedicate quiet-snan instruction as we're using an
6488     // omni-purpose G_FCANONICALIZE.
6489     if (!isKnownNeverSNaN(Src0, MRI))
6490       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6491 
6492     if (!isKnownNeverSNaN(Src1, MRI))
6493       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6494   }
6495 
6496   // If there are no nans, it's safe to simply replace this with the non-IEEE
6497   // version.
6498   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6499   MI.eraseFromParent();
6500   return Legalized;
6501 }
6502 
6503 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6504   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6505   Register DstReg = MI.getOperand(0).getReg();
6506   LLT Ty = MRI.getType(DstReg);
6507   unsigned Flags = MI.getFlags();
6508 
6509   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6510                                   Flags);
6511   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6512   MI.eraseFromParent();
6513   return Legalized;
6514 }
6515 
6516 LegalizerHelper::LegalizeResult
6517 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6518   auto [DstReg, X] = MI.getFirst2Regs();
6519   const unsigned Flags = MI.getFlags();
6520   const LLT Ty = MRI.getType(DstReg);
6521   const LLT CondTy = Ty.changeElementSize(1);
6522 
6523   // round(x) =>
6524   //  t = trunc(x);
6525   //  d = fabs(x - t);
6526   //  o = copysign(1.0f, x);
6527   //  return t + (d >= 0.5 ? o : 0.0);
6528 
6529   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
6530 
6531   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
6532   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
6533   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6534   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
6535   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
6536   auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
6537 
6538   auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
6539                                   Flags);
6540   auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
6541 
6542   MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
6543 
6544   MI.eraseFromParent();
6545   return Legalized;
6546 }
6547 
6548 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
6549   auto [DstReg, SrcReg] = MI.getFirst2Regs();
6550   unsigned Flags = MI.getFlags();
6551   LLT Ty = MRI.getType(DstReg);
6552   const LLT CondTy = Ty.changeElementSize(1);
6553 
6554   // result = trunc(src);
6555   // if (src < 0.0 && src != result)
6556   //   result += -1.0.
6557 
6558   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
6559   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
6560 
6561   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
6562                                   SrcReg, Zero, Flags);
6563   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
6564                                       SrcReg, Trunc, Flags);
6565   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
6566   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
6567 
6568   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
6569   MI.eraseFromParent();
6570   return Legalized;
6571 }
6572 
6573 LegalizerHelper::LegalizeResult
6574 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
6575   const unsigned NumOps = MI.getNumOperands();
6576   auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
6577   unsigned PartSize = Src0Ty.getSizeInBits();
6578 
6579   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
6580   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
6581 
6582   for (unsigned I = 2; I != NumOps; ++I) {
6583     const unsigned Offset = (I - 1) * PartSize;
6584 
6585     Register SrcReg = MI.getOperand(I).getReg();
6586     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
6587 
6588     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
6589       MRI.createGenericVirtualRegister(WideTy);
6590 
6591     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
6592     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
6593     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
6594     ResultReg = NextResult;
6595   }
6596 
6597   if (DstTy.isPointer()) {
6598     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
6599           DstTy.getAddressSpace())) {
6600       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
6601       return UnableToLegalize;
6602     }
6603 
6604     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
6605   }
6606 
6607   MI.eraseFromParent();
6608   return Legalized;
6609 }
6610 
6611 LegalizerHelper::LegalizeResult
6612 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
6613   const unsigned NumDst = MI.getNumOperands() - 1;
6614   Register SrcReg = MI.getOperand(NumDst).getReg();
6615   Register Dst0Reg = MI.getOperand(0).getReg();
6616   LLT DstTy = MRI.getType(Dst0Reg);
6617   if (DstTy.isPointer())
6618     return UnableToLegalize; // TODO
6619 
6620   SrcReg = coerceToScalar(SrcReg);
6621   if (!SrcReg)
6622     return UnableToLegalize;
6623 
6624   // Expand scalarizing unmerge as bitcast to integer and shift.
6625   LLT IntTy = MRI.getType(SrcReg);
6626 
6627   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
6628 
6629   const unsigned DstSize = DstTy.getSizeInBits();
6630   unsigned Offset = DstSize;
6631   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
6632     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
6633     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
6634     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
6635   }
6636 
6637   MI.eraseFromParent();
6638   return Legalized;
6639 }
6640 
6641 /// Lower a vector extract or insert by writing the vector to a stack temporary
6642 /// and reloading the element or vector.
6643 ///
6644 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
6645 ///  =>
6646 ///  %stack_temp = G_FRAME_INDEX
6647 ///  G_STORE %vec, %stack_temp
6648 ///  %idx = clamp(%idx, %vec.getNumElements())
6649 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
6650 ///  %dst = G_LOAD %element_ptr
6651 LegalizerHelper::LegalizeResult
6652 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
6653   Register DstReg = MI.getOperand(0).getReg();
6654   Register SrcVec = MI.getOperand(1).getReg();
6655   Register InsertVal;
6656   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
6657     InsertVal = MI.getOperand(2).getReg();
6658 
6659   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
6660 
6661   LLT VecTy = MRI.getType(SrcVec);
6662   LLT EltTy = VecTy.getElementType();
6663   unsigned NumElts = VecTy.getNumElements();
6664 
6665   int64_t IdxVal;
6666   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
6667     SmallVector<Register, 8> SrcRegs;
6668     extractParts(SrcVec, EltTy, NumElts, SrcRegs);
6669 
6670     if (InsertVal) {
6671       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
6672       MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
6673     } else {
6674       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
6675     }
6676 
6677     MI.eraseFromParent();
6678     return Legalized;
6679   }
6680 
6681   if (!EltTy.isByteSized()) { // Not implemented.
6682     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
6683     return UnableToLegalize;
6684   }
6685 
6686   unsigned EltBytes = EltTy.getSizeInBytes();
6687   Align VecAlign = getStackTemporaryAlignment(VecTy);
6688   Align EltAlign;
6689 
6690   MachinePointerInfo PtrInfo;
6691   auto StackTemp = createStackTemporary(TypeSize::Fixed(VecTy.getSizeInBytes()),
6692                                         VecAlign, PtrInfo);
6693   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
6694 
6695   // Get the pointer to the element, and be sure not to hit undefined behavior
6696   // if the index is out of bounds.
6697   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
6698 
6699   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
6700     int64_t Offset = IdxVal * EltBytes;
6701     PtrInfo = PtrInfo.getWithOffset(Offset);
6702     EltAlign = commonAlignment(VecAlign, Offset);
6703   } else {
6704     // We lose information with a variable offset.
6705     EltAlign = getStackTemporaryAlignment(EltTy);
6706     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
6707   }
6708 
6709   if (InsertVal) {
6710     // Write the inserted element
6711     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
6712 
6713     // Reload the whole vector.
6714     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
6715   } else {
6716     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
6717   }
6718 
6719   MI.eraseFromParent();
6720   return Legalized;
6721 }
6722 
6723 LegalizerHelper::LegalizeResult
6724 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
6725   auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
6726       MI.getFirst3RegLLTs();
6727   LLT IdxTy = LLT::scalar(32);
6728 
6729   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
6730 
6731   if (DstTy.isScalar()) {
6732     if (Src0Ty.isVector())
6733       return UnableToLegalize;
6734 
6735     // This is just a SELECT.
6736     assert(Mask.size() == 1 && "Expected a single mask element");
6737     Register Val;
6738     if (Mask[0] < 0 || Mask[0] > 1)
6739       Val = MIRBuilder.buildUndef(DstTy).getReg(0);
6740     else
6741       Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
6742     MIRBuilder.buildCopy(DstReg, Val);
6743     MI.eraseFromParent();
6744     return Legalized;
6745   }
6746 
6747   Register Undef;
6748   SmallVector<Register, 32> BuildVec;
6749   LLT EltTy = DstTy.getElementType();
6750 
6751   for (int Idx : Mask) {
6752     if (Idx < 0) {
6753       if (!Undef.isValid())
6754         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
6755       BuildVec.push_back(Undef);
6756       continue;
6757     }
6758 
6759     if (Src0Ty.isScalar()) {
6760       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
6761     } else {
6762       int NumElts = Src0Ty.getNumElements();
6763       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
6764       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
6765       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
6766       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
6767       BuildVec.push_back(Extract.getReg(0));
6768     }
6769   }
6770 
6771   MIRBuilder.buildBuildVector(DstReg, BuildVec);
6772   MI.eraseFromParent();
6773   return Legalized;
6774 }
6775 
6776 LegalizerHelper::LegalizeResult
6777 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
6778   const auto &MF = *MI.getMF();
6779   const auto &TFI = *MF.getSubtarget().getFrameLowering();
6780   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
6781     return UnableToLegalize;
6782 
6783   Register Dst = MI.getOperand(0).getReg();
6784   Register AllocSize = MI.getOperand(1).getReg();
6785   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
6786 
6787   LLT PtrTy = MRI.getType(Dst);
6788   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
6789 
6790   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
6791   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
6792   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
6793 
6794   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
6795   // have to generate an extra instruction to negate the alloc and then use
6796   // G_PTR_ADD to add the negative offset.
6797   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
6798   if (Alignment > Align(1)) {
6799     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
6800     AlignMask.negate();
6801     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
6802     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
6803   }
6804 
6805   SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
6806   MIRBuilder.buildCopy(SPReg, SPTmp);
6807   MIRBuilder.buildCopy(Dst, SPTmp);
6808 
6809   MI.eraseFromParent();
6810   return Legalized;
6811 }
6812 
6813 LegalizerHelper::LegalizeResult
6814 LegalizerHelper::lowerExtract(MachineInstr &MI) {
6815   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6816   unsigned Offset = MI.getOperand(2).getImm();
6817 
6818   // Extract sub-vector or one element
6819   if (SrcTy.isVector()) {
6820     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
6821     unsigned DstSize = DstTy.getSizeInBits();
6822 
6823     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
6824         (Offset + DstSize <= SrcTy.getSizeInBits())) {
6825       // Unmerge and allow access to each Src element for the artifact combiner.
6826       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);
6827 
6828       // Take element(s) we need to extract and copy it (merge them).
6829       SmallVector<Register, 8> SubVectorElts;
6830       for (unsigned Idx = Offset / SrcEltSize;
6831            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
6832         SubVectorElts.push_back(Unmerge.getReg(Idx));
6833       }
6834       if (SubVectorElts.size() == 1)
6835         MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
6836       else
6837         MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);
6838 
6839       MI.eraseFromParent();
6840       return Legalized;
6841     }
6842   }
6843 
6844   if (DstTy.isScalar() &&
6845       (SrcTy.isScalar() ||
6846        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
6847     LLT SrcIntTy = SrcTy;
6848     if (!SrcTy.isScalar()) {
6849       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
6850       SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
6851     }
6852 
6853     if (Offset == 0)
6854       MIRBuilder.buildTrunc(DstReg, SrcReg);
6855     else {
6856       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
6857       auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
6858       MIRBuilder.buildTrunc(DstReg, Shr);
6859     }
6860 
6861     MI.eraseFromParent();
6862     return Legalized;
6863   }
6864 
6865   return UnableToLegalize;
6866 }
6867 
6868 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
6869   auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
6870   uint64_t Offset = MI.getOperand(3).getImm();
6871 
6872   LLT DstTy = MRI.getType(Src);
6873   LLT InsertTy = MRI.getType(InsertSrc);
6874 
6875   // Insert sub-vector or one element
6876   if (DstTy.isVector() && !InsertTy.isPointer()) {
6877     LLT EltTy = DstTy.getElementType();
6878     unsigned EltSize = EltTy.getSizeInBits();
6879     unsigned InsertSize = InsertTy.getSizeInBits();
6880 
6881     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
6882         (Offset + InsertSize <= DstTy.getSizeInBits())) {
6883       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
6884       SmallVector<Register, 8> DstElts;
6885       unsigned Idx = 0;
6886       // Elements from Src before insert start Offset
6887       for (; Idx < Offset / EltSize; ++Idx) {
6888         DstElts.push_back(UnmergeSrc.getReg(Idx));
6889       }
6890 
6891       // Replace elements in Src with elements from InsertSrc
6892       if (InsertTy.getSizeInBits() > EltSize) {
6893         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
6894         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
6895              ++Idx, ++i) {
6896           DstElts.push_back(UnmergeInsertSrc.getReg(i));
6897         }
6898       } else {
6899         DstElts.push_back(InsertSrc);
6900         ++Idx;
6901       }
6902 
6903       // Remaining elements from Src after insert
6904       for (; Idx < DstTy.getNumElements(); ++Idx) {
6905         DstElts.push_back(UnmergeSrc.getReg(Idx));
6906       }
6907 
6908       MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
6909       MI.eraseFromParent();
6910       return Legalized;
6911     }
6912   }
6913 
6914   if (InsertTy.isVector() ||
6915       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
6916     return UnableToLegalize;
6917 
6918   const DataLayout &DL = MIRBuilder.getDataLayout();
6919   if ((DstTy.isPointer() &&
6920        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
6921       (InsertTy.isPointer() &&
6922        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
6923     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
6924     return UnableToLegalize;
6925   }
6926 
6927   LLT IntDstTy = DstTy;
6928 
6929   if (!DstTy.isScalar()) {
6930     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
6931     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
6932   }
6933 
6934   if (!InsertTy.isScalar()) {
6935     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
6936     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
6937   }
6938 
6939   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
6940   if (Offset != 0) {
6941     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
6942     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
6943   }
6944 
6945   APInt MaskVal = APInt::getBitsSetWithWrap(
6946       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
6947 
6948   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
6949   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
6950   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
6951 
6952   MIRBuilder.buildCast(Dst, Or);
6953   MI.eraseFromParent();
6954   return Legalized;
6955 }
6956 
6957 LegalizerHelper::LegalizeResult
6958 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
6959   auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
6960       MI.getFirst4RegLLTs();
6961   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
6962 
6963   LLT Ty = Dst0Ty;
6964   LLT BoolTy = Dst1Ty;
6965 
6966   if (IsAdd)
6967     MIRBuilder.buildAdd(Dst0, LHS, RHS);
6968   else
6969     MIRBuilder.buildSub(Dst0, LHS, RHS);
6970 
6971   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
6972 
6973   auto Zero = MIRBuilder.buildConstant(Ty, 0);
6974 
6975   // For an addition, the result should be less than one of the operands (LHS)
6976   // if and only if the other operand (RHS) is negative, otherwise there will
6977   // be overflow.
6978   // For a subtraction, the result should be less than one of the operands
6979   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
6980   // otherwise there will be overflow.
6981   auto ResultLowerThanLHS =
6982       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
6983   auto ConditionRHS = MIRBuilder.buildICmp(
6984       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
6985 
6986   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
6987   MI.eraseFromParent();
6988   return Legalized;
6989 }
6990 
6991 LegalizerHelper::LegalizeResult
6992 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
6993   auto [Res, LHS, RHS] = MI.getFirst3Regs();
6994   LLT Ty = MRI.getType(Res);
6995   bool IsSigned;
6996   bool IsAdd;
6997   unsigned BaseOp;
6998   switch (MI.getOpcode()) {
6999   default:
7000     llvm_unreachable("unexpected addsat/subsat opcode");
7001   case TargetOpcode::G_UADDSAT:
7002     IsSigned = false;
7003     IsAdd = true;
7004     BaseOp = TargetOpcode::G_ADD;
7005     break;
7006   case TargetOpcode::G_SADDSAT:
7007     IsSigned = true;
7008     IsAdd = true;
7009     BaseOp = TargetOpcode::G_ADD;
7010     break;
7011   case TargetOpcode::G_USUBSAT:
7012     IsSigned = false;
7013     IsAdd = false;
7014     BaseOp = TargetOpcode::G_SUB;
7015     break;
7016   case TargetOpcode::G_SSUBSAT:
7017     IsSigned = true;
7018     IsAdd = false;
7019     BaseOp = TargetOpcode::G_SUB;
7020     break;
7021   }
7022 
7023   if (IsSigned) {
7024     // sadd.sat(a, b) ->
7025     //   hi = 0x7fffffff - smax(a, 0)
7026     //   lo = 0x80000000 - smin(a, 0)
7027     //   a + smin(smax(lo, b), hi)
7028     // ssub.sat(a, b) ->
7029     //   lo = smax(a, -1) - 0x7fffffff
7030     //   hi = smin(a, -1) - 0x80000000
7031     //   a - smin(smax(lo, b), hi)
7032     // TODO: AMDGPU can use a "median of 3" instruction here:
7033     //   a +/- med3(lo, b, hi)
7034     uint64_t NumBits = Ty.getScalarSizeInBits();
7035     auto MaxVal =
7036         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7037     auto MinVal =
7038         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7039     MachineInstrBuilder Hi, Lo;
7040     if (IsAdd) {
7041       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7042       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7043       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7044     } else {
7045       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7046       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7047                                MaxVal);
7048       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7049                                MinVal);
7050     }
7051     auto RHSClamped =
7052         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7053     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7054   } else {
7055     // uadd.sat(a, b) -> a + umin(~a, b)
7056     // usub.sat(a, b) -> a - umin(a, b)
7057     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7058     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7059     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7060   }
7061 
7062   MI.eraseFromParent();
7063   return Legalized;
7064 }
7065 
7066 LegalizerHelper::LegalizeResult
7067 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7068   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7069   LLT Ty = MRI.getType(Res);
7070   LLT BoolTy = Ty.changeElementSize(1);
7071   bool IsSigned;
7072   bool IsAdd;
7073   unsigned OverflowOp;
7074   switch (MI.getOpcode()) {
7075   default:
7076     llvm_unreachable("unexpected addsat/subsat opcode");
7077   case TargetOpcode::G_UADDSAT:
7078     IsSigned = false;
7079     IsAdd = true;
7080     OverflowOp = TargetOpcode::G_UADDO;
7081     break;
7082   case TargetOpcode::G_SADDSAT:
7083     IsSigned = true;
7084     IsAdd = true;
7085     OverflowOp = TargetOpcode::G_SADDO;
7086     break;
7087   case TargetOpcode::G_USUBSAT:
7088     IsSigned = false;
7089     IsAdd = false;
7090     OverflowOp = TargetOpcode::G_USUBO;
7091     break;
7092   case TargetOpcode::G_SSUBSAT:
7093     IsSigned = true;
7094     IsAdd = false;
7095     OverflowOp = TargetOpcode::G_SSUBO;
7096     break;
7097   }
7098 
7099   auto OverflowRes =
7100       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7101   Register Tmp = OverflowRes.getReg(0);
7102   Register Ov = OverflowRes.getReg(1);
7103   MachineInstrBuilder Clamp;
7104   if (IsSigned) {
7105     // sadd.sat(a, b) ->
7106     //   {tmp, ov} = saddo(a, b)
7107     //   ov ? (tmp >>s 31) + 0x80000000 : r
7108     // ssub.sat(a, b) ->
7109     //   {tmp, ov} = ssubo(a, b)
7110     //   ov ? (tmp >>s 31) + 0x80000000 : r
7111     uint64_t NumBits = Ty.getScalarSizeInBits();
7112     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7113     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7114     auto MinVal =
7115         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7116     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7117   } else {
7118     // uadd.sat(a, b) ->
7119     //   {tmp, ov} = uaddo(a, b)
7120     //   ov ? 0xffffffff : tmp
7121     // usub.sat(a, b) ->
7122     //   {tmp, ov} = usubo(a, b)
7123     //   ov ? 0 : tmp
7124     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7125   }
7126   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7127 
7128   MI.eraseFromParent();
7129   return Legalized;
7130 }
7131 
7132 LegalizerHelper::LegalizeResult
7133 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7134   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7135           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7136          "Expected shlsat opcode!");
7137   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7138   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7139   LLT Ty = MRI.getType(Res);
7140   LLT BoolTy = Ty.changeElementSize(1);
7141 
7142   unsigned BW = Ty.getScalarSizeInBits();
7143   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7144   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7145                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7146 
7147   MachineInstrBuilder SatVal;
7148   if (IsSigned) {
7149     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7150     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7151     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7152                                     MIRBuilder.buildConstant(Ty, 0));
7153     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7154   } else {
7155     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7156   }
7157   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7158   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7159 
7160   MI.eraseFromParent();
7161   return Legalized;
7162 }
7163 
7164 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
7165   auto [Dst, Src] = MI.getFirst2Regs();
7166   const LLT Ty = MRI.getType(Src);
7167   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7168   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7169 
7170   // Swap most and least significant byte, set remaining bytes in Res to zero.
7171   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7172   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7173   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7174   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7175 
7176   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7177   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7178     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7179     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7180     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7181     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7182     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7183     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7184     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7185     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7186     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7187     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7188     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7189     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7190   }
7191   Res.getInstr()->getOperand(0).setReg(Dst);
7192 
7193   MI.eraseFromParent();
7194   return Legalized;
7195 }
7196 
7197 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7198 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7199                                  MachineInstrBuilder Src, APInt Mask) {
7200   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7201   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7202   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7203   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7204   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7205   return B.buildOr(Dst, LHS, RHS);
7206 }
7207 
7208 LegalizerHelper::LegalizeResult
7209 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7210   auto [Dst, Src] = MI.getFirst2Regs();
7211   const LLT Ty = MRI.getType(Src);
7212   unsigned Size = Ty.getSizeInBits();
7213 
7214   MachineInstrBuilder BSWAP =
7215       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7216 
7217   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7218   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7219   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7220   MachineInstrBuilder Swap4 =
7221       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7222 
7223   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7224   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7225   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7226   MachineInstrBuilder Swap2 =
7227       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7228 
7229   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7230   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7231   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7232   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7233 
7234   MI.eraseFromParent();
7235   return Legalized;
7236 }
7237 
7238 LegalizerHelper::LegalizeResult
7239 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7240   MachineFunction &MF = MIRBuilder.getMF();
7241 
7242   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7243   int NameOpIdx = IsRead ? 1 : 0;
7244   int ValRegIndex = IsRead ? 0 : 1;
7245 
7246   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7247   const LLT Ty = MRI.getType(ValReg);
7248   const MDString *RegStr = cast<MDString>(
7249     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7250 
7251   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7252   if (!PhysReg.isValid())
7253     return UnableToLegalize;
7254 
7255   if (IsRead)
7256     MIRBuilder.buildCopy(ValReg, PhysReg);
7257   else
7258     MIRBuilder.buildCopy(PhysReg, ValReg);
7259 
7260   MI.eraseFromParent();
7261   return Legalized;
7262 }
7263 
7264 LegalizerHelper::LegalizeResult
7265 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7266   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7267   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7268   Register Result = MI.getOperand(0).getReg();
7269   LLT OrigTy = MRI.getType(Result);
7270   auto SizeInBits = OrigTy.getScalarSizeInBits();
7271   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7272 
7273   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7274   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7275   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7276   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7277 
7278   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7279   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7280   MIRBuilder.buildTrunc(Result, Shifted);
7281 
7282   MI.eraseFromParent();
7283   return Legalized;
7284 }
7285 
7286 LegalizerHelper::LegalizeResult
7287 LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
7288   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7289   FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());
7290 
7291   if (Mask == fcNone) {
7292     MIRBuilder.buildConstant(DstReg, 0);
7293     MI.eraseFromParent();
7294     return Legalized;
7295   }
7296   if (Mask == fcAllFlags) {
7297     MIRBuilder.buildConstant(DstReg, 1);
7298     MI.eraseFromParent();
7299     return Legalized;
7300   }
7301 
7302   // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
7303   // version
7304 
7305   unsigned BitSize = SrcTy.getScalarSizeInBits();
7306   const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
7307 
7308   LLT IntTy = LLT::scalar(BitSize);
7309   if (SrcTy.isVector())
7310     IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
7311   auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);
7312 
7313   // Various masks.
7314   APInt SignBit = APInt::getSignMask(BitSize);
7315   APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
7316   APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
7317   APInt ExpMask = Inf;
7318   APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
7319   APInt QNaNBitMask =
7320       APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
7321   APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());
7322 
7323   auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
7324   auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
7325   auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
7326   auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
7327   auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);
7328 
7329   auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
7330   auto Sign =
7331       MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);
7332 
7333   auto Res = MIRBuilder.buildConstant(DstTy, 0);
7334   // Clang doesn't support capture of structured bindings:
7335   LLT DstTyCopy = DstTy;
7336   const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
7337     Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
7338   };
7339 
7340   // Tests that involve more than one class should be processed first.
7341   if ((Mask & fcFinite) == fcFinite) {
7342     // finite(V) ==> abs(V) u< exp_mask
7343     appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
7344                                      ExpMaskC));
7345     Mask &= ~fcFinite;
7346   } else if ((Mask & fcFinite) == fcPosFinite) {
7347     // finite(V) && V > 0 ==> V u< exp_mask
7348     appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
7349                                      ExpMaskC));
7350     Mask &= ~fcPosFinite;
7351   } else if ((Mask & fcFinite) == fcNegFinite) {
7352     // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
7353     auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
7354                                     ExpMaskC);
7355     auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
7356     appendToRes(And);
7357     Mask &= ~fcNegFinite;
7358   }
7359 
7360   if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
7361     // fcZero | fcSubnormal => test all exponent bits are 0
7362     // TODO: Handle sign bit specific cases
7363     // TODO: Handle inverted case
7364     if (PartialCheck == (fcZero | fcSubnormal)) {
7365       auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
7366       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7367                                        ExpBits, ZeroC));
7368       Mask &= ~PartialCheck;
7369     }
7370   }
7371 
7372   // Check for individual classes.
7373   if (FPClassTest PartialCheck = Mask & fcZero) {
7374     if (PartialCheck == fcPosZero)
7375       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7376                                        AsInt, ZeroC));
7377     else if (PartialCheck == fcZero)
7378       appendToRes(
7379           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
7380     else // fcNegZero
7381       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7382                                        AsInt, SignBitC));
7383   }
7384 
7385   if (FPClassTest PartialCheck = Mask & fcSubnormal) {
7386     // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
7387     // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
7388     auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
7389     auto OneC = MIRBuilder.buildConstant(IntTy, 1);
7390     auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
7391     auto SubnormalRes =
7392         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
7393                              MIRBuilder.buildConstant(IntTy, AllOneMantissa));
7394     if (PartialCheck == fcNegSubnormal)
7395       SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
7396     appendToRes(SubnormalRes);
7397   }
7398 
7399   if (FPClassTest PartialCheck = Mask & fcInf) {
7400     if (PartialCheck == fcPosInf)
7401       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7402                                        AsInt, InfC));
7403     else if (PartialCheck == fcInf)
7404       appendToRes(
7405           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
7406     else { // fcNegInf
7407       APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
7408       auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
7409       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7410                                        AsInt, NegInfC));
7411     }
7412   }
7413 
7414   if (FPClassTest PartialCheck = Mask & fcNan) {
7415     auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
7416     if (PartialCheck == fcNan) {
7417       // isnan(V) ==> abs(V) u> int(inf)
7418       appendToRes(
7419           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
7420     } else if (PartialCheck == fcQNan) {
7421       // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
7422       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
7423                                        InfWithQnanBitC));
7424     } else { // fcSNan
7425       // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
7426       //                    abs(V) u< (unsigned(Inf) | quiet_bit)
7427       auto IsNan =
7428           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
7429       auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
7430                                             Abs, InfWithQnanBitC);
7431       appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
7432     }
7433   }
7434 
7435   if (FPClassTest PartialCheck = Mask & fcNormal) {
7436     // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
7437     // (max_exp-1))
7438     APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
7439     auto ExpMinusOne = MIRBuilder.buildSub(
7440         IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
7441     APInt MaxExpMinusOne = ExpMask - ExpLSB;
7442     auto NormalRes =
7443         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
7444                              MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
7445     if (PartialCheck == fcNegNormal)
7446       NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
7447     else if (PartialCheck == fcPosNormal) {
7448       auto PosSign = MIRBuilder.buildXor(
7449           DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask));
7450       NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
7451     }
7452     appendToRes(NormalRes);
7453   }
7454 
7455   MIRBuilder.buildCopy(DstReg, Res);
7456   MI.eraseFromParent();
7457   return Legalized;
7458 }
7459 
7460 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7461   // Implement vector G_SELECT in terms of XOR, AND, OR.
7462   auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
7463       MI.getFirst4RegLLTs();
7464   if (!DstTy.isVector())
7465     return UnableToLegalize;
7466 
7467   bool IsEltPtr = DstTy.getElementType().isPointer();
7468   if (IsEltPtr) {
7469     LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
7470     LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
7471     Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
7472     Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
7473     DstTy = NewTy;
7474   }
7475 
7476   if (MaskTy.isScalar()) {
7477     // Turn the scalar condition into a vector condition mask.
7478 
7479     Register MaskElt = MaskReg;
7480 
7481     // The condition was potentially zero extended before, but we want a sign
7482     // extended boolean.
7483     if (MaskTy != LLT::scalar(1))
7484       MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
7485 
7486     // Continue the sign extension (or truncate) to match the data type.
7487     MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(),
7488                                           MaskElt).getReg(0);
7489 
7490     // Generate a vector splat idiom.
7491     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7492     MaskReg = ShufSplat.getReg(0);
7493     MaskTy = DstTy;
7494   }
7495 
7496   if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
7497     return UnableToLegalize;
7498   }
7499 
7500   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7501   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7502   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7503   if (IsEltPtr) {
7504     auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
7505     MIRBuilder.buildIntToPtr(DstReg, Or);
7506   } else {
7507     MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7508   }
7509   MI.eraseFromParent();
7510   return Legalized;
7511 }
7512 
7513 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
7514   // Split DIVREM into individual instructions.
7515   unsigned Opcode = MI.getOpcode();
7516 
7517   MIRBuilder.buildInstr(
7518       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
7519                                         : TargetOpcode::G_UDIV,
7520       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7521   MIRBuilder.buildInstr(
7522       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
7523                                         : TargetOpcode::G_UREM,
7524       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
7525   MI.eraseFromParent();
7526   return Legalized;
7527 }
7528 
7529 LegalizerHelper::LegalizeResult
7530 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
7531   // Expand %res = G_ABS %a into:
7532   // %v1 = G_ASHR %a, scalar_size-1
7533   // %v2 = G_ADD %a, %v1
7534   // %res = G_XOR %v2, %v1
7535   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
7536   Register OpReg = MI.getOperand(1).getReg();
7537   auto ShiftAmt =
7538       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
7539   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
7540   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
7541   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
7542   MI.eraseFromParent();
7543   return Legalized;
7544 }
7545 
7546 LegalizerHelper::LegalizeResult
7547 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
7548   // Expand %res = G_ABS %a into:
7549   // %v1 = G_CONSTANT 0
7550   // %v2 = G_SUB %v1, %a
7551   // %res = G_SMAX %a, %v2
7552   Register SrcReg = MI.getOperand(1).getReg();
7553   LLT Ty = MRI.getType(SrcReg);
7554   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
7555   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
7556   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
7557   MI.eraseFromParent();
7558   return Legalized;
7559 }
7560 
7561 LegalizerHelper::LegalizeResult
7562 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
7563   Register SrcReg = MI.getOperand(1).getReg();
7564   LLT SrcTy = MRI.getType(SrcReg);
7565   LLT DstTy = MRI.getType(SrcReg);
7566 
7567   // The source could be a scalar if the IR type was <1 x sN>.
7568   if (SrcTy.isScalar()) {
7569     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
7570       return UnableToLegalize; // FIXME: handle extension.
7571     // This can be just a plain copy.
7572     Observer.changingInstr(MI);
7573     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
7574     Observer.changedInstr(MI);
7575     return Legalized;
7576   }
7577   return UnableToLegalize;
7578 }
7579 
7580 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
7581   // On Darwin, -Os means optimize for size without hurting performance, so
7582   // only really optimize for size when -Oz (MinSize) is used.
7583   if (MF.getTarget().getTargetTriple().isOSDarwin())
7584     return MF.getFunction().hasMinSize();
7585   return MF.getFunction().hasOptSize();
7586 }
7587 
7588 // Returns a list of types to use for memory op lowering in MemOps. A partial
7589 // port of findOptimalMemOpLowering in TargetLowering.
7590 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
7591                                           unsigned Limit, const MemOp &Op,
7592                                           unsigned DstAS, unsigned SrcAS,
7593                                           const AttributeList &FuncAttributes,
7594                                           const TargetLowering &TLI) {
7595   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
7596     return false;
7597 
7598   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
7599 
7600   if (Ty == LLT()) {
7601     // Use the largest scalar type whose alignment constraints are satisfied.
7602     // We only need to check DstAlign here as SrcAlign is always greater or
7603     // equal to DstAlign (or zero).
7604     Ty = LLT::scalar(64);
7605     if (Op.isFixedDstAlign())
7606       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
7607              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
7608         Ty = LLT::scalar(Ty.getSizeInBytes());
7609     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
7610     // FIXME: check for the largest legal type we can load/store to.
7611   }
7612 
7613   unsigned NumMemOps = 0;
7614   uint64_t Size = Op.size();
7615   while (Size) {
7616     unsigned TySize = Ty.getSizeInBytes();
7617     while (TySize > Size) {
7618       // For now, only use non-vector load / store's for the left-over pieces.
7619       LLT NewTy = Ty;
7620       // FIXME: check for mem op safety and legality of the types. Not all of
7621       // SDAGisms map cleanly to GISel concepts.
7622       if (NewTy.isVector())
7623         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
7624       NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
7625       unsigned NewTySize = NewTy.getSizeInBytes();
7626       assert(NewTySize > 0 && "Could not find appropriate type");
7627 
7628       // If the new LLT cannot cover all of the remaining bits, then consider
7629       // issuing a (or a pair of) unaligned and overlapping load / store.
7630       unsigned Fast;
7631       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
7632       MVT VT = getMVTForLLT(Ty);
7633       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
7634           TLI.allowsMisalignedMemoryAccesses(
7635               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
7636               MachineMemOperand::MONone, &Fast) &&
7637           Fast)
7638         TySize = Size;
7639       else {
7640         Ty = NewTy;
7641         TySize = NewTySize;
7642       }
7643     }
7644 
7645     if (++NumMemOps > Limit)
7646       return false;
7647 
7648     MemOps.push_back(Ty);
7649     Size -= TySize;
7650   }
7651 
7652   return true;
7653 }
7654 
7655 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
7656   if (Ty.isVector())
7657     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
7658                                 Ty.getNumElements());
7659   return IntegerType::get(C, Ty.getSizeInBits());
7660 }
7661 
7662 // Get a vectorized representation of the memset value operand, GISel edition.
7663 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
7664   MachineRegisterInfo &MRI = *MIB.getMRI();
7665   unsigned NumBits = Ty.getScalarSizeInBits();
7666   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7667   if (!Ty.isVector() && ValVRegAndVal) {
7668     APInt Scalar = ValVRegAndVal->Value.trunc(8);
7669     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
7670     return MIB.buildConstant(Ty, SplatVal).getReg(0);
7671   }
7672 
7673   // Extend the byte value to the larger type, and then multiply by a magic
7674   // value 0x010101... in order to replicate it across every byte.
7675   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
7676   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
7677     return MIB.buildConstant(Ty, 0).getReg(0);
7678   }
7679 
7680   LLT ExtType = Ty.getScalarType();
7681   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
7682   if (NumBits > 8) {
7683     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
7684     auto MagicMI = MIB.buildConstant(ExtType, Magic);
7685     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
7686   }
7687 
7688   // For vector types create a G_BUILD_VECTOR.
7689   if (Ty.isVector())
7690     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
7691 
7692   return Val;
7693 }
7694 
7695 LegalizerHelper::LegalizeResult
7696 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
7697                              uint64_t KnownLen, Align Alignment,
7698                              bool IsVolatile) {
7699   auto &MF = *MI.getParent()->getParent();
7700   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7701   auto &DL = MF.getDataLayout();
7702   LLVMContext &C = MF.getFunction().getContext();
7703 
7704   assert(KnownLen != 0 && "Have a zero length memset length!");
7705 
7706   bool DstAlignCanChange = false;
7707   MachineFrameInfo &MFI = MF.getFrameInfo();
7708   bool OptSize = shouldLowerMemFuncForSize(MF);
7709 
7710   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7711   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7712     DstAlignCanChange = true;
7713 
7714   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
7715   std::vector<LLT> MemOps;
7716 
7717   const auto &DstMMO = **MI.memoperands_begin();
7718   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7719 
7720   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
7721   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
7722 
7723   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
7724                                      MemOp::Set(KnownLen, DstAlignCanChange,
7725                                                 Alignment,
7726                                                 /*IsZeroMemset=*/IsZeroVal,
7727                                                 /*IsVolatile=*/IsVolatile),
7728                                      DstPtrInfo.getAddrSpace(), ~0u,
7729                                      MF.getFunction().getAttributes(), TLI))
7730     return UnableToLegalize;
7731 
7732   if (DstAlignCanChange) {
7733     // Get an estimate of the type from the LLT.
7734     Type *IRTy = getTypeForLLT(MemOps[0], C);
7735     Align NewAlign = DL.getABITypeAlign(IRTy);
7736     if (NewAlign > Alignment) {
7737       Alignment = NewAlign;
7738       unsigned FI = FIDef->getOperand(1).getIndex();
7739       // Give the stack frame object a larger alignment if needed.
7740       if (MFI.getObjectAlign(FI) < Alignment)
7741         MFI.setObjectAlignment(FI, Alignment);
7742     }
7743   }
7744 
7745   MachineIRBuilder MIB(MI);
7746   // Find the largest store and generate the bit pattern for it.
7747   LLT LargestTy = MemOps[0];
7748   for (unsigned i = 1; i < MemOps.size(); i++)
7749     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
7750       LargestTy = MemOps[i];
7751 
7752   // The memset stored value is always defined as an s8, so in order to make it
7753   // work with larger store types we need to repeat the bit pattern across the
7754   // wider type.
7755   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
7756 
7757   if (!MemSetValue)
7758     return UnableToLegalize;
7759 
7760   // Generate the stores. For each store type in the list, we generate the
7761   // matching store of that type to the destination address.
7762   LLT PtrTy = MRI.getType(Dst);
7763   unsigned DstOff = 0;
7764   unsigned Size = KnownLen;
7765   for (unsigned I = 0; I < MemOps.size(); I++) {
7766     LLT Ty = MemOps[I];
7767     unsigned TySize = Ty.getSizeInBytes();
7768     if (TySize > Size) {
7769       // Issuing an unaligned load / store pair that overlaps with the previous
7770       // pair. Adjust the offset accordingly.
7771       assert(I == MemOps.size() - 1 && I != 0);
7772       DstOff -= TySize - Size;
7773     }
7774 
7775     // If this store is smaller than the largest store see whether we can get
7776     // the smaller value for free with a truncate.
7777     Register Value = MemSetValue;
7778     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
7779       MVT VT = getMVTForLLT(Ty);
7780       MVT LargestVT = getMVTForLLT(LargestTy);
7781       if (!LargestTy.isVector() && !Ty.isVector() &&
7782           TLI.isTruncateFree(LargestVT, VT))
7783         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
7784       else
7785         Value = getMemsetValue(Val, Ty, MIB);
7786       if (!Value)
7787         return UnableToLegalize;
7788     }
7789 
7790     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
7791 
7792     Register Ptr = Dst;
7793     if (DstOff != 0) {
7794       auto Offset =
7795           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
7796       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
7797     }
7798 
7799     MIB.buildStore(Value, Ptr, *StoreMMO);
7800     DstOff += Ty.getSizeInBytes();
7801     Size -= TySize;
7802   }
7803 
7804   MI.eraseFromParent();
7805   return Legalized;
7806 }
7807 
7808 LegalizerHelper::LegalizeResult
7809 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
7810   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7811 
7812   auto [Dst, Src, Len] = MI.getFirst3Regs();
7813 
7814   const auto *MMOIt = MI.memoperands_begin();
7815   const MachineMemOperand *MemOp = *MMOIt;
7816   bool IsVolatile = MemOp->isVolatile();
7817 
7818   // See if this is a constant length copy
7819   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
7820   // FIXME: support dynamically sized G_MEMCPY_INLINE
7821   assert(LenVRegAndVal &&
7822          "inline memcpy with dynamic size is not yet supported");
7823   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
7824   if (KnownLen == 0) {
7825     MI.eraseFromParent();
7826     return Legalized;
7827   }
7828 
7829   const auto &DstMMO = **MI.memoperands_begin();
7830   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7831   Align DstAlign = DstMMO.getBaseAlign();
7832   Align SrcAlign = SrcMMO.getBaseAlign();
7833 
7834   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
7835                            IsVolatile);
7836 }
7837 
7838 LegalizerHelper::LegalizeResult
7839 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
7840                                    uint64_t KnownLen, Align DstAlign,
7841                                    Align SrcAlign, bool IsVolatile) {
7842   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
7843   return lowerMemcpy(MI, Dst, Src, KnownLen,
7844                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
7845                      IsVolatile);
7846 }
7847 
7848 LegalizerHelper::LegalizeResult
7849 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
7850                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
7851                              Align SrcAlign, bool IsVolatile) {
7852   auto &MF = *MI.getParent()->getParent();
7853   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7854   auto &DL = MF.getDataLayout();
7855   LLVMContext &C = MF.getFunction().getContext();
7856 
7857   assert(KnownLen != 0 && "Have a zero length memcpy length!");
7858 
7859   bool DstAlignCanChange = false;
7860   MachineFrameInfo &MFI = MF.getFrameInfo();
7861   Align Alignment = std::min(DstAlign, SrcAlign);
7862 
7863   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7864   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7865     DstAlignCanChange = true;
7866 
7867   // FIXME: infer better src pointer alignment like SelectionDAG does here.
7868   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
7869   // if the memcpy is in a tail call position.
7870 
7871   std::vector<LLT> MemOps;
7872 
7873   const auto &DstMMO = **MI.memoperands_begin();
7874   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7875   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7876   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7877 
7878   if (!findGISelOptimalMemOpLowering(
7879           MemOps, Limit,
7880           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7881                       IsVolatile),
7882           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7883           MF.getFunction().getAttributes(), TLI))
7884     return UnableToLegalize;
7885 
7886   if (DstAlignCanChange) {
7887     // Get an estimate of the type from the LLT.
7888     Type *IRTy = getTypeForLLT(MemOps[0], C);
7889     Align NewAlign = DL.getABITypeAlign(IRTy);
7890 
7891     // Don't promote to an alignment that would require dynamic stack
7892     // realignment.
7893     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
7894     if (!TRI->hasStackRealignment(MF))
7895       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
7896         NewAlign = NewAlign.previous();
7897 
7898     if (NewAlign > Alignment) {
7899       Alignment = NewAlign;
7900       unsigned FI = FIDef->getOperand(1).getIndex();
7901       // Give the stack frame object a larger alignment if needed.
7902       if (MFI.getObjectAlign(FI) < Alignment)
7903         MFI.setObjectAlignment(FI, Alignment);
7904     }
7905   }
7906 
7907   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
7908 
7909   MachineIRBuilder MIB(MI);
7910   // Now we need to emit a pair of load and stores for each of the types we've
7911   // collected. I.e. for each type, generate a load from the source pointer of
7912   // that type width, and then generate a corresponding store to the dest buffer
7913   // of that value loaded. This can result in a sequence of loads and stores
7914   // mixed types, depending on what the target specifies as good types to use.
7915   unsigned CurrOffset = 0;
7916   unsigned Size = KnownLen;
7917   for (auto CopyTy : MemOps) {
7918     // Issuing an unaligned load / store pair  that overlaps with the previous
7919     // pair. Adjust the offset accordingly.
7920     if (CopyTy.getSizeInBytes() > Size)
7921       CurrOffset -= CopyTy.getSizeInBytes() - Size;
7922 
7923     // Construct MMOs for the accesses.
7924     auto *LoadMMO =
7925         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
7926     auto *StoreMMO =
7927         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
7928 
7929     // Create the load.
7930     Register LoadPtr = Src;
7931     Register Offset;
7932     if (CurrOffset != 0) {
7933       LLT SrcTy = MRI.getType(Src);
7934       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
7935                    .getReg(0);
7936       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
7937     }
7938     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
7939 
7940     // Create the store.
7941     Register StorePtr = Dst;
7942     if (CurrOffset != 0) {
7943       LLT DstTy = MRI.getType(Dst);
7944       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
7945     }
7946     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
7947     CurrOffset += CopyTy.getSizeInBytes();
7948     Size -= CopyTy.getSizeInBytes();
7949   }
7950 
7951   MI.eraseFromParent();
7952   return Legalized;
7953 }
7954 
7955 LegalizerHelper::LegalizeResult
7956 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
7957                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
7958                               bool IsVolatile) {
7959   auto &MF = *MI.getParent()->getParent();
7960   const auto &TLI = *MF.getSubtarget().getTargetLowering();
7961   auto &DL = MF.getDataLayout();
7962   LLVMContext &C = MF.getFunction().getContext();
7963 
7964   assert(KnownLen != 0 && "Have a zero length memmove length!");
7965 
7966   bool DstAlignCanChange = false;
7967   MachineFrameInfo &MFI = MF.getFrameInfo();
7968   bool OptSize = shouldLowerMemFuncForSize(MF);
7969   Align Alignment = std::min(DstAlign, SrcAlign);
7970 
7971   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
7972   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
7973     DstAlignCanChange = true;
7974 
7975   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
7976   std::vector<LLT> MemOps;
7977 
7978   const auto &DstMMO = **MI.memoperands_begin();
7979   const auto &SrcMMO = **std::next(MI.memoperands_begin());
7980   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
7981   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
7982 
7983   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
7984   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
7985   // same thing here.
7986   if (!findGISelOptimalMemOpLowering(
7987           MemOps, Limit,
7988           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
7989                       /*IsVolatile*/ true),
7990           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
7991           MF.getFunction().getAttributes(), TLI))
7992     return UnableToLegalize;
7993 
7994   if (DstAlignCanChange) {
7995     // Get an estimate of the type from the LLT.
7996     Type *IRTy = getTypeForLLT(MemOps[0], C);
7997     Align NewAlign = DL.getABITypeAlign(IRTy);
7998 
7999     // Don't promote to an alignment that would require dynamic stack
8000     // realignment.
8001     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8002     if (!TRI->hasStackRealignment(MF))
8003       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
8004         NewAlign = NewAlign.previous();
8005 
8006     if (NewAlign > Alignment) {
8007       Alignment = NewAlign;
8008       unsigned FI = FIDef->getOperand(1).getIndex();
8009       // Give the stack frame object a larger alignment if needed.
8010       if (MFI.getObjectAlign(FI) < Alignment)
8011         MFI.setObjectAlignment(FI, Alignment);
8012     }
8013   }
8014 
8015   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
8016 
8017   MachineIRBuilder MIB(MI);
8018   // Memmove requires that we perform the loads first before issuing the stores.
8019   // Apart from that, this loop is pretty much doing the same thing as the
8020   // memcpy codegen function.
8021   unsigned CurrOffset = 0;
8022   SmallVector<Register, 16> LoadVals;
8023   for (auto CopyTy : MemOps) {
8024     // Construct MMO for the load.
8025     auto *LoadMMO =
8026         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
8027 
8028     // Create the load.
8029     Register LoadPtr = Src;
8030     if (CurrOffset != 0) {
8031       LLT SrcTy = MRI.getType(Src);
8032       auto Offset =
8033           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
8034       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
8035     }
8036     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
8037     CurrOffset += CopyTy.getSizeInBytes();
8038   }
8039 
8040   CurrOffset = 0;
8041   for (unsigned I = 0; I < MemOps.size(); ++I) {
8042     LLT CopyTy = MemOps[I];
8043     // Now store the values loaded.
8044     auto *StoreMMO =
8045         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
8046 
8047     Register StorePtr = Dst;
8048     if (CurrOffset != 0) {
8049       LLT DstTy = MRI.getType(Dst);
8050       auto Offset =
8051           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
8052       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
8053     }
8054     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
8055     CurrOffset += CopyTy.getSizeInBytes();
8056   }
8057   MI.eraseFromParent();
8058   return Legalized;
8059 }
8060 
8061 LegalizerHelper::LegalizeResult
8062 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
8063   const unsigned Opc = MI.getOpcode();
8064   // This combine is fairly complex so it's not written with a separate
8065   // matcher function.
8066   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
8067           Opc == TargetOpcode::G_MEMSET) &&
8068          "Expected memcpy like instruction");
8069 
8070   auto MMOIt = MI.memoperands_begin();
8071   const MachineMemOperand *MemOp = *MMOIt;
8072 
8073   Align DstAlign = MemOp->getBaseAlign();
8074   Align SrcAlign;
8075   auto [Dst, Src, Len] = MI.getFirst3Regs();
8076 
8077   if (Opc != TargetOpcode::G_MEMSET) {
8078     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
8079     MemOp = *(++MMOIt);
8080     SrcAlign = MemOp->getBaseAlign();
8081   }
8082 
8083   // See if this is a constant length copy
8084   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8085   if (!LenVRegAndVal)
8086     return UnableToLegalize;
8087   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8088 
8089   if (KnownLen == 0) {
8090     MI.eraseFromParent();
8091     return Legalized;
8092   }
8093 
8094   bool IsVolatile = MemOp->isVolatile();
8095   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
8096     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8097                              IsVolatile);
8098 
8099   // Don't try to optimize volatile.
8100   if (IsVolatile)
8101     return UnableToLegalize;
8102 
8103   if (MaxLen && KnownLen > MaxLen)
8104     return UnableToLegalize;
8105 
8106   if (Opc == TargetOpcode::G_MEMCPY) {
8107     auto &MF = *MI.getParent()->getParent();
8108     const auto &TLI = *MF.getSubtarget().getTargetLowering();
8109     bool OptSize = shouldLowerMemFuncForSize(MF);
8110     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
8111     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
8112                        IsVolatile);
8113   }
8114   if (Opc == TargetOpcode::G_MEMMOVE)
8115     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
8116   if (Opc == TargetOpcode::G_MEMSET)
8117     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
8118   return UnableToLegalize;
8119 }
8120