xref: /freebsd/contrib/llvm-project/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/GISelValueTracking.h"
19 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/LowLevelTypeUtils.h"
26 #include "llvm/CodeGen/MachineConstantPool.h"
27 #include "llvm/CodeGen/MachineFrameInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/RuntimeLibcallUtil.h"
30 #include "llvm/CodeGen/TargetFrameLowering.h"
31 #include "llvm/CodeGen/TargetInstrInfo.h"
32 #include "llvm/CodeGen/TargetLowering.h"
33 #include "llvm/CodeGen/TargetOpcodes.h"
34 #include "llvm/CodeGen/TargetSubtargetInfo.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/Support/Debug.h"
37 #include "llvm/Support/MathExtras.h"
38 #include "llvm/Support/raw_ostream.h"
39 #include "llvm/Target/TargetMachine.h"
40 #include <numeric>
41 #include <optional>
42 
43 #define DEBUG_TYPE "legalizer"
44 
45 using namespace llvm;
46 using namespace LegalizeActions;
47 using namespace MIPatternMatch;
48 
49 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
50 ///
51 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
52 /// with any leftover piece as type \p LeftoverTy
53 ///
54 /// Returns -1 in the first element of the pair if the breakdown is not
55 /// satisfiable.
56 static std::pair<int, int>
57 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
58   assert(!LeftoverTy.isValid() && "this is an out argument");
59 
60   unsigned Size = OrigTy.getSizeInBits();
61   unsigned NarrowSize = NarrowTy.getSizeInBits();
62   unsigned NumParts = Size / NarrowSize;
63   unsigned LeftoverSize = Size - NumParts * NarrowSize;
64   assert(Size > NarrowSize);
65 
66   if (LeftoverSize == 0)
67     return {NumParts, 0};
68 
69   if (NarrowTy.isVector()) {
70     unsigned EltSize = OrigTy.getScalarSizeInBits();
71     if (LeftoverSize % EltSize != 0)
72       return {-1, -1};
73     LeftoverTy =
74         LLT::scalarOrVector(ElementCount::getFixed(LeftoverSize / EltSize),
75                             OrigTy.getElementType());
76   } else {
77     LeftoverTy = LLT::scalar(LeftoverSize);
78   }
79 
80   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
81   return std::make_pair(NumParts, NumLeftover);
82 }
83 
84 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
85 
86   if (!Ty.isScalar())
87     return nullptr;
88 
89   switch (Ty.getSizeInBits()) {
90   case 16:
91     return Type::getHalfTy(Ctx);
92   case 32:
93     return Type::getFloatTy(Ctx);
94   case 64:
95     return Type::getDoubleTy(Ctx);
96   case 80:
97     return Type::getX86_FP80Ty(Ctx);
98   case 128:
99     return Type::getFP128Ty(Ctx);
100   default:
101     return nullptr;
102   }
103 }
104 
// Convenience constructor: pulls the LegalizerInfo from the function's
// subtarget and runs without value-tracking information (VT is null).
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &Builder)
    : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
      LI(*MF.getSubtarget().getLegalizerInfo()),
      TLI(*MF.getSubtarget().getTargetLowering()), VT(nullptr) {}
111 
// Constructor taking an explicit LegalizerInfo and an optional value-tracking
// analysis (\p VT may be null).
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
                                 GISelChangeObserver &Observer,
                                 MachineIRBuilder &B, GISelValueTracking *VT)
    : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
      TLI(*MF.getSubtarget().getTargetLowering()), VT(VT) {}
117 
// Perform one legalization step on \p MI: query the LegalizerInfo for the
// action to take and dispatch to the matching transform. Returns AlreadyLegal,
// Legalized, or UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);

  MIRBuilder.setInstrAndDebugLoc(MI);

  // Intrinsics bypass the action table and go straight to the target hook.
  if (isa<GIntrinsic>(MI))
    return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
  auto Step = LI.getAction(MI, MRI);
  switch (Step.Action) {
  case Legal:
    LLVM_DEBUG(dbgs() << ".. Already legal\n");
    return AlreadyLegal;
  case Libcall:
    LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
    return libcall(MI, LocObserver);
  case NarrowScalar:
    LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
    return narrowScalar(MI, Step.TypeIdx, Step.NewType);
  case WidenScalar:
    LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
    return widenScalar(MI, Step.TypeIdx, Step.NewType);
  case Bitcast:
    LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
    return bitcast(MI, Step.TypeIdx, Step.NewType);
  case Lower:
    LLVM_DEBUG(dbgs() << ".. Lower\n");
    return lower(MI, Step.TypeIdx, Step.NewType);
  case FewerElements:
    LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
    return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
  case MoreElements:
    LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
    return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
  case Custom:
    LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
    return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
                                                     : UnableToLegalize;
  default:
    LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
    return UnableToLegalize;
  }
}
162 
// Reassemble \p DstReg (of type \p ResultTy) from \p PartRegs pieces of type
// \p PartTy, plus optional \p LeftoverRegs of type \p LeftoverTy for the
// remainder that did not divide evenly.
void LegalizerHelper::insertParts(Register DstReg,
                                  LLT ResultTy, LLT PartTy,
                                  ArrayRef<Register> PartRegs,
                                  LLT LeftoverTy,
                                  ArrayRef<Register> LeftoverRegs) {
  // No leftover: the parts tile the result exactly, so a single merge,
  // concat, or build_vector suffices.
  if (!LeftoverTy.isValid()) {
    assert(LeftoverRegs.empty());

    if (!ResultTy.isVector()) {
      MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
      return;
    }

    if (PartTy.isVector())
      MIRBuilder.buildConcatVectors(DstReg, PartRegs);
    else
      MIRBuilder.buildBuildVector(DstReg, PartRegs);
    return;
  }

  // Merge sub-vectors with different number of elements and insert into DstReg.
  if (ResultTy.isVector()) {
    assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
    SmallVector<Register, 8> AllRegs(PartRegs);
    AllRegs.append(LeftoverRegs.begin(), LeftoverRegs.end());
    return mergeMixedSubvectors(DstReg, AllRegs);
  }

  // Scalar result with a leftover: split everything down to the common GCD
  // type, then re-merge to an LCM-sized value and extract the result bits.
  SmallVector<Register> GCDRegs;
  LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
  for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
    extractGCDType(GCDRegs, GCDTy, PartReg);
  LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
  buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
}
198 
199 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
200                                        Register Reg) {
201   LLT Ty = MRI.getType(Reg);
202   SmallVector<Register, 8> RegElts;
203   extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
204                MIRBuilder, MRI);
205   Elts.append(RegElts);
206 }
207 
208 /// Merge \p PartRegs with different types into \p DstReg.
209 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
210                                            ArrayRef<Register> PartRegs) {
211   SmallVector<Register, 8> AllElts;
212   for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
213     appendVectorElts(AllElts, PartRegs[i]);
214 
215   Register Leftover = PartRegs[PartRegs.size() - 1];
216   if (!MRI.getType(Leftover).isVector())
217     AllElts.push_back(Leftover);
218   else
219     appendVectorElts(AllElts, Leftover);
220 
221   MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
222 }
223 
224 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
225 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
226                               const MachineInstr &MI) {
227   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
228 
229   const int StartIdx = Regs.size();
230   const int NumResults = MI.getNumOperands() - 1;
231   Regs.resize(Regs.size() + NumResults);
232   for (int I = 0; I != NumResults; ++I)
233     Regs[StartIdx + I] = MI.getOperand(I).getReg();
234 }
235 
236 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
237                                      LLT GCDTy, Register SrcReg) {
238   LLT SrcTy = MRI.getType(SrcReg);
239   if (SrcTy == GCDTy) {
240     // If the source already evenly divides the result type, we don't need to do
241     // anything.
242     Parts.push_back(SrcReg);
243   } else {
244     // Need to split into common type sized pieces.
245     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
246     getUnmergeResults(Parts, *Unmerge);
247   }
248 }
249 
250 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
251                                     LLT NarrowTy, Register SrcReg) {
252   LLT SrcTy = MRI.getType(SrcReg);
253   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
254   extractGCDType(Parts, GCDTy, SrcReg);
255   return GCDTy;
256 }
257 
// Merge the GCD-typed pieces in \p VRegs into NarrowTy-sized values covering
// the LCM of \p DstTy and \p NarrowTy, padding missing high pieces according
// to \p PadStrategy (G_ZEXT = zero, G_ANYEXT = undef, G_SEXT = sign bits).
// On return \p VRegs holds the NarrowTy-typed merge results; the LCM type is
// returned so the caller can extract the final DstTy value from it.
LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
                                         SmallVectorImpl<Register> &VRegs,
                                         unsigned PadStrategy) {
  LLT LCMTy = getLCMType(DstTy, NarrowTy);

  // Total GCD pieces needed = NumParts * NumSubParts; the original sources may
  // fall short of that and require padding.
  int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
  int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
  int NumOrigSrc = VRegs.size();

  Register PadReg;

  // Get a value we can use to pad the source value if the sources won't evenly
  // cover the result type.
  if (NumOrigSrc < NumParts * NumSubParts) {
    if (PadStrategy == TargetOpcode::G_ZEXT)
      PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
    else if (PadStrategy == TargetOpcode::G_ANYEXT)
      PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
    else {
      assert(PadStrategy == TargetOpcode::G_SEXT);

      // Shift the sign bit of the low register through the high register.
      auto ShiftAmt =
        MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
      PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
    }
  }

  // Registers for the final merge to be produced.
  SmallVector<Register, 4> Remerge(NumParts);

  // Registers needed for intermediate merges, which will be merged into a
  // source for Remerge.
  SmallVector<Register, 4> SubMerge(NumSubParts);

  // Once we've fully read off the end of the original source bits, we can reuse
  // the same high bits for remaining padding elements.
  Register AllPadReg;

  // Build merges to the LCM type to cover the original result type.
  for (int I = 0; I != NumParts; ++I) {
    bool AllMergePartsArePadding = true;

    // Build the requested merges to the requested type.
    for (int J = 0; J != NumSubParts; ++J) {
      int Idx = I * NumSubParts + J;
      // Past the end of the original sources: substitute the pad value.
      if (Idx >= NumOrigSrc) {
        SubMerge[J] = PadReg;
        continue;
      }

      SubMerge[J] = VRegs[Idx];

      // There are meaningful bits here we can't reuse later.
      AllMergePartsArePadding = false;
    }

    // If we've filled up a complete piece with padding bits, we can directly
    // emit the natural sized constant if applicable, rather than a merge of
    // smaller constants.
    if (AllMergePartsArePadding && !AllPadReg) {
      if (PadStrategy == TargetOpcode::G_ANYEXT)
        AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
      else if (PadStrategy == TargetOpcode::G_ZEXT)
        AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);

      // If this is a sign extension, we can't materialize a trivial constant
      // with the right type and have to produce a merge.
    }

    if (AllPadReg) {
      // Avoid creating additional instructions if we're just adding additional
      // copies of padding bits.
      Remerge[I] = AllPadReg;
      continue;
    }

    if (NumSubParts == 1)
      Remerge[I] = SubMerge[0];
    else
      Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);

    // In the sign extend padding case, re-use the first all-signbit merge.
    if (AllMergePartsArePadding && !AllPadReg)
      AllPadReg = Remerge[I];
  }

  VRegs = std::move(Remerge);
  return LCMTy;
}
348 
// Merge \p RemergeRegs into a value of type \p LCMTy, then narrow that value
// back down to \p DstReg's type: directly if the types match, via G_TRUNC for
// scalars, or via G_UNMERGE_VALUES (keeping only the first result) when the
// LCM type is a vector.
void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
                                               ArrayRef<Register> RemergeRegs) {
  LLT DstTy = MRI.getType(DstReg);

  // Create the merge to the widened source, and extract the relevant bits into
  // the result.

  if (DstTy == LCMTy) {
    MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
    return;
  }

  auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
  if (DstTy.isScalar() && LCMTy.isScalar()) {
    MIRBuilder.buildTrunc(DstReg, Remerge);
    return;
  }

  if (LCMTy.isVector()) {
    // Unmerge into NumDefs pieces; the destination takes the low piece and
    // the remaining defs are dead scratch registers.
    unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
    SmallVector<Register, 8> UnmergeDefs(NumDefs);
    UnmergeDefs[0] = DstReg;
    for (unsigned I = 1; I != NumDefs; ++I)
      UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);

    MIRBuilder.buildUnmerge(UnmergeDefs,
                            MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
    return;
  }

  llvm_unreachable("unhandled case");
}
381 
/// Map a generic opcode and scalar bit size to the corresponding runtime
/// library call. Integer ops support sizes 32/64/128; FP ops additionally
/// support 80 (x87). Unsupported sizes or opcodes are fatal errors.
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
#define RTLIBCASE_INT(LibcallPrefix)                                           \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

#define RTLIBCASE(LibcallPrefix)                                               \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return RTLIB::LibcallPrefix##32;                                         \
    case 64:                                                                   \
      return RTLIB::LibcallPrefix##64;                                         \
    case 80:                                                                   \
      return RTLIB::LibcallPrefix##80;                                         \
    case 128:                                                                  \
      return RTLIB::LibcallPrefix##128;                                        \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Opcode) {
  case TargetOpcode::G_LROUND:
    RTLIBCASE(LROUND_F);
  case TargetOpcode::G_LLROUND:
    RTLIBCASE(LLROUND_F);
  case TargetOpcode::G_MUL:
    RTLIBCASE_INT(MUL_I);
  case TargetOpcode::G_SDIV:
    RTLIBCASE_INT(SDIV_I);
  case TargetOpcode::G_UDIV:
    RTLIBCASE_INT(UDIV_I);
  case TargetOpcode::G_SREM:
    RTLIBCASE_INT(SREM_I);
  case TargetOpcode::G_UREM:
    RTLIBCASE_INT(UREM_I);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    RTLIBCASE_INT(CTLZ_I);
  case TargetOpcode::G_FADD:
    RTLIBCASE(ADD_F);
  case TargetOpcode::G_FSUB:
    RTLIBCASE(SUB_F);
  case TargetOpcode::G_FMUL:
    RTLIBCASE(MUL_F);
  case TargetOpcode::G_FDIV:
    RTLIBCASE(DIV_F);
  case TargetOpcode::G_FEXP:
    RTLIBCASE(EXP_F);
  case TargetOpcode::G_FEXP2:
    RTLIBCASE(EXP2_F);
  case TargetOpcode::G_FEXP10:
    RTLIBCASE(EXP10_F);
  case TargetOpcode::G_FREM:
    RTLIBCASE(REM_F);
  case TargetOpcode::G_FPOW:
    RTLIBCASE(POW_F);
  case TargetOpcode::G_FPOWI:
    RTLIBCASE(POWI_F);
  case TargetOpcode::G_FMA:
    RTLIBCASE(FMA_F);
  case TargetOpcode::G_FSIN:
    RTLIBCASE(SIN_F);
  case TargetOpcode::G_FCOS:
    RTLIBCASE(COS_F);
  case TargetOpcode::G_FTAN:
    RTLIBCASE(TAN_F);
  case TargetOpcode::G_FASIN:
    RTLIBCASE(ASIN_F);
  case TargetOpcode::G_FACOS:
    RTLIBCASE(ACOS_F);
  case TargetOpcode::G_FATAN:
    RTLIBCASE(ATAN_F);
  case TargetOpcode::G_FATAN2:
    RTLIBCASE(ATAN2_F);
  case TargetOpcode::G_FSINH:
    RTLIBCASE(SINH_F);
  case TargetOpcode::G_FCOSH:
    RTLIBCASE(COSH_F);
  case TargetOpcode::G_FTANH:
    RTLIBCASE(TANH_F);
  case TargetOpcode::G_FSINCOS:
    RTLIBCASE(SINCOS_F);
  case TargetOpcode::G_FLOG10:
    RTLIBCASE(LOG10_F);
  case TargetOpcode::G_FLOG:
    RTLIBCASE(LOG_F);
  case TargetOpcode::G_FLOG2:
    RTLIBCASE(LOG2_F);
  case TargetOpcode::G_FLDEXP:
    RTLIBCASE(LDEXP_F);
  case TargetOpcode::G_FCEIL:
    RTLIBCASE(CEIL_F);
  case TargetOpcode::G_FFLOOR:
    RTLIBCASE(FLOOR_F);
  case TargetOpcode::G_FMINNUM:
    RTLIBCASE(FMIN_F);
  case TargetOpcode::G_FMAXNUM:
    RTLIBCASE(FMAX_F);
  case TargetOpcode::G_FSQRT:
    RTLIBCASE(SQRT_F);
  case TargetOpcode::G_FRINT:
    RTLIBCASE(RINT_F);
  case TargetOpcode::G_FNEARBYINT:
    RTLIBCASE(NEARBYINT_F);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    RTLIBCASE(TRUNC_F);
  case TargetOpcode::G_INTRINSIC_ROUND:
    RTLIBCASE(ROUND_F);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    RTLIBCASE(ROUNDEVEN_F);
  case TargetOpcode::G_INTRINSIC_LRINT:
    RTLIBCASE(LRINT_F);
  case TargetOpcode::G_INTRINSIC_LLRINT:
    RTLIBCASE(LLRINT_F);
  }
  llvm_unreachable("Unknown libcall function");
#undef RTLIBCASE_INT
#undef RTLIBCASE
}
511 
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
                                    MachineInstr &MI,
                                    const TargetInstrInfo &TII,
                                    MachineRegisterInfo &MRI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const Function &F = MBB.getParent()->getFunction();

  // Conservatively require the attributes of the call to match those of
  // the return. Ignore NoAlias and NonNull because they don't affect the
  // call sequence.
  AttributeList CallerAttrs = F.getAttributes();
  if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
          .removeAttribute(Attribute::NoAlias)
          .removeAttribute(Attribute::NonNull)
          .hasAttributes())
    return false;

  // It's not safe to eliminate the sign / zero extension of the return value.
  if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
      CallerAttrs.hasRetAttr(Attribute::SExt))
    return false;

  // Only tail call if the following instruction is a standard return or if we
  // have a `thisreturn` callee, and a sequence like:
  //
  //   G_MEMCPY %0, %1, %2
  //   $x0 = COPY %0
  //   RET_ReallyLR implicit $x0
  auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
  if (Next != MBB.instr_end() && Next->isCopy()) {
    // bzero does not return its destination, so no `thisreturn` pattern.
    if (MI.getOpcode() == TargetOpcode::G_BZERO)
      return false;

    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
    Register VReg = MI.getOperand(0).getReg();
    if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
      return false;

    // The COPY must move the value into a physical return register.
    Register PReg = Next->getOperand(0).getReg();
    if (!PReg.isPhysical())
      return false;

    // The instruction after the COPY must be the return itself.
    auto Ret = next_nodbg(Next, MBB.instr_end());
    if (Ret == MBB.instr_end() || !Ret->isReturn())
      return false;

    if (Ret->getNumImplicitOperands() != 1)
      return false;

    if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
      return false;

    // Skip over the COPY that we just validated.
    Next = Ret;
  }

  if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
    return false;

  return true;
}
577 
// Emit a call to the external symbol \p Name with calling convention \p CC,
// returning \p Result and taking \p Args. If \p MI is given and the call sits
// in tail position, the call is lowered as a tail call and the now-redundant
// trailing return (and intervening COPY/debug instrs) are deleted.
LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
                    const CallLowering::ArgInfo &Result,
                    ArrayRef<CallLowering::ArgInfo> Args,
                    const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
                    MachineInstr *MI) {
  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = CC;
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = Result;
  // Tail calls are only possible when the libcall's return type matches the
  // caller's (or is void) and MI sits directly before the function's return.
  if (MI)
    Info.IsTailCall =
        (Result.Ty->isVoidTy() ||
         Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
        isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
                                *MIRBuilder.getMRI());

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (MI && Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI->getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI->getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }
  return LegalizerHelper::Legalized;
}
624 
625 LegalizerHelper::LegalizeResult
626 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
627                     const CallLowering::ArgInfo &Result,
628                     ArrayRef<CallLowering::ArgInfo> Args,
629                     LostDebugLocObserver &LocObserver, MachineInstr *MI) {
630   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
631   const char *Name = TLI.getLibcallName(Libcall);
632   if (!Name)
633     return LegalizerHelper::UnableToLegalize;
634   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
635   return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
636 }
637 
638 // Useful for libcalls where all operands have the same type.
639 static LegalizerHelper::LegalizeResult
640 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
641               Type *OpType, LostDebugLocObserver &LocObserver) {
642   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
643 
644   // FIXME: What does the original arg index mean here?
645   SmallVector<CallLowering::ArgInfo, 3> Args;
646   for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
647     Args.push_back({MO.getReg(), OpType, 0});
648   return createLibcall(MIRBuilder, Libcall,
649                        {MI.getOperand(0).getReg(), OpType, 0}, Args,
650                        LocObserver, &MI);
651 }
652 
// Lower G_FSINCOS to a sincos libcall: allocate two stack temporaries, call
// sincos(src, &sin, &cos), then load both results back into the destination
// registers and erase the original instruction.
LegalizerHelper::LegalizeResult LegalizerHelper::emitSincosLibcall(
    MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType,
    LostDebugLocObserver &LocObserver) {
  MachineFunction &MF = *MI.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register DstSin = MI.getOperand(0).getReg();
  Register DstCos = MI.getOperand(1).getReg();
  Register Src = MI.getOperand(2).getReg();
  LLT DstTy = MRI.getType(DstSin);

  int MemSize = DstTy.getSizeInBytes();
  Align Alignment = getStackTemporaryAlignment(DstTy);
  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getAllocaAddrSpace();
  MachinePointerInfo PtrInfo;

  // Stack slots the libcall writes the sin and cos results into.
  Register StackPtrSin =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);
  Register StackPtrCos =
      createStackTemporary(TypeSize::getFixed(MemSize), Alignment, PtrInfo)
          .getReg(0);

  // sincos returns void; results come back through the two pointers.
  auto &Ctx = MF.getFunction().getContext();
  auto LibcallResult =
      createLibcall(MIRBuilder, getRTLibDesc(MI.getOpcode(), Size),
                    {{0}, Type::getVoidTy(Ctx), 0},
                    {{Src, OpType, 0},
                     {StackPtrSin, PointerType::get(Ctx, AddrSpace), 1},
                     {StackPtrCos, PointerType::get(Ctx, AddrSpace), 2}},
                    LocObserver, &MI);

  if (LibcallResult != LegalizeResult::Legalized)
    return LegalizerHelper::UnableToLegalize;

  MachineMemOperand *LoadMMOSin = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);
  MachineMemOperand *LoadMMOCos = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Alignment);

  // Read the results back out of the stack temporaries.
  MIRBuilder.buildLoad(DstSin, StackPtrSin, *LoadMMOSin);
  MIRBuilder.buildLoad(DstCos, StackPtrCos, *LoadMMOCos);
  MI.eraseFromParent();

  return LegalizerHelper::Legalized;
}
700 
// Lower a G_MEMCPY/G_MEMMOVE/G_MEMSET/G_BZERO instruction to the matching
// runtime library call, attempting a tail call when the original instruction's
// trailing 'tail' immediate is set and the call sits in tail position.
LegalizerHelper::LegalizeResult
llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                       MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  SmallVector<CallLowering::ArgInfo, 3> Args;
  // Add all the args, except for the last which is an imm denoting 'tail'.
  for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
    Register Reg = MI.getOperand(i).getReg();

    // Need to derive an IR type for call lowering.
    LLT OpLLT = MRI.getType(Reg);
    Type *OpTy = nullptr;
    if (OpLLT.isPointer())
      OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
    else
      OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
    Args.push_back({Reg, OpTy, 0});
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall;
  unsigned Opc = MI.getOpcode();
  const char *Name;
  switch (Opc) {
  case TargetOpcode::G_BZERO:
    RTLibcall = RTLIB::BZERO;
    Name = TLI.getLibcallName(RTLibcall);
    break;
  case TargetOpcode::G_MEMCPY:
    RTLibcall = RTLIB::MEMCPY;
    Name = TLI.getMemcpyName();
    // memcpy/memmove/memset return their destination argument, so mark the
    // first arg as 'returned' to enable the thisreturn tail-call pattern.
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMMOVE:
    RTLibcall = RTLIB::MEMMOVE;
    Name = TLI.getLibcallName(RTLibcall);
    Args[0].Flags[0].setReturned();
    break;
  case TargetOpcode::G_MEMSET:
    RTLibcall = RTLIB::MEMSET;
    Name = TLI.getLibcallName(RTLibcall);
    Args[0].Flags[0].setReturned();
    break;
  default:
    llvm_unreachable("unsupported opcode");
  }

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
  Info.IsTailCall =
      MI.getOperand(MI.getNumOperands() - 1).getImm() &&
      isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  if (Info.LoweredTailCall) {
    assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");

    // Check debug locations before removing the return.
    LocObserver.checkpoint(true);

    // We must have a return following the call (or debug insts) to get past
    // isLibCallInTailPosition.
    do {
      MachineInstr *Next = MI.getNextNode();
      assert(Next &&
             (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
             "Expected instr following MI to be return or debug inst?");
      // We lowered a tail call, so the call is now the return from the block.
      // Delete the old return.
      Next->eraseFromParent();
    } while (MI.getNextNode());

    // We expect to lose the debug location from the return.
    LocObserver.checkpoint(false);
  }

  return LegalizerHelper::Legalized;
}
793 
/// Map an atomic G_* instruction to the matching outline-atomics runtime
/// libcall, selected by opcode, merged memory ordering, and access size.
/// Returns RTLIB::UNKNOWN_LIBCALL when no outlined helper applies (vector
/// memory types or unhandled opcodes).
static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  auto &AtomicMI = cast<GMemOperation>(MI);
  auto &MMO = AtomicMI.getMMO();
  auto Ordering = MMO.getMergedOrdering();
  LLT MemType = MMO.getMemoryType();
  uint64_t MemSize = MemType.getSizeInBytes();
  // Outlined helpers only exist for scalar accesses.
  if (MemType.isVector())
    return RTLIB::UNKNOWN_LIBCALL;

  // LCALLS expands to the four ordering variants (relaxed, acquire, release,
  // acq_rel) of one access size; LCALL5 builds the full 5x4 table for sizes
  // 1, 2, 4, 8 and 16 bytes. getOutlineAtomicHelper indexes the table by
  // size and ordering.
#define LCALLS(A, B) {A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL}
#define LCALL5(A)                                                              \
  LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XCHG: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  // SUB reuses the LDADD helper: createAtomicLibcall negates the operand.
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  // AND maps to the LDCLR (bit-clear) helper: createAtomicLibcall inverts
  // the operand before the call.
  case TargetOpcode::G_ATOMICRMW_AND: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_OR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  case TargetOpcode::G_ATOMICRMW_XOR: {
    const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
    return getOutlineAtomicHelper(LC, Ordering, MemSize);
  }
  default:
    return RTLIB::UNKNOWN_LIBCALL;
  }
#undef LCALLS
#undef LCALL5
}
840 
/// Lower an atomic instruction to a call to an outline-atomics runtime
/// helper. Builds the return-value and argument lists from the instruction's
/// register operands, rewrites operands where the helper's semantics differ
/// from the generic opcode (AND, SUB), and emits the call through the
/// target's CallLowering.
static LegalizerHelper::LegalizeResult
createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  Type *RetTy;
  SmallVector<Register> RetRegs;
  SmallVector<CallLowering::ArgInfo, 3> Args;
  unsigned Opc = MI.getOpcode();
  switch (Opc) {
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    Register Success;
    LLT SuccessLLT;
    auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
        MI.getFirst4RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      // The _WITH_SUCCESS form carries an extra result register; re-read all
      // operands including the success flag, and return a {value, success}
      // struct instead of a plain integer.
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
      RetRegs.push_back(Success);
      RetTy = StructType::get(
          Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
    }
    // Argument order passed to the CAS helper: expected, desired, pointer.
    Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
    Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR: {
    auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
    RetRegs.push_back(Ret);
    RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
    // AND is implemented via the LDCLR (bit-clear) helper, so pass ~Val;
    // SUB is implemented via LDADD, so pass -Val (built as 0 - Val).
    if (Opc == TargetOpcode::G_ATOMICRMW_AND)
      Val =
          MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
              .getReg(0);
    else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
      Val =
          MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
              .getReg(0);
    // Argument order passed to the RMW helpers: value, pointer.
    Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
    Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
    break;
  }
  default:
    llvm_unreachable("unsupported opcode");
  }

  auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
  auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
  const char *Name = TLI.getLibcallName(RTLibcall);

  // Unsupported libcall on the target.
  if (!Name) {
    LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
                      << MIRBuilder.getTII().getName(Opc) << "\n");
    return LegalizerHelper::UnableToLegalize;
  }

  CallLowering::CallLoweringInfo Info;
  Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
  Info.Callee = MachineOperand::CreateES(Name);
  Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);

  llvm::append_range(Info.OrigArgs, Args);
  if (!CLI.lowerCall(MIRBuilder, Info))
    return LegalizerHelper::UnableToLegalize;

  return LegalizerHelper::Legalized;
}
918 
919 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
920                                        Type *FromType) {
921   auto ToMVT = MVT::getVT(ToType);
922   auto FromMVT = MVT::getVT(FromType);
923 
924   switch (Opcode) {
925   case TargetOpcode::G_FPEXT:
926     return RTLIB::getFPEXT(FromMVT, ToMVT);
927   case TargetOpcode::G_FPTRUNC:
928     return RTLIB::getFPROUND(FromMVT, ToMVT);
929   case TargetOpcode::G_FPTOSI:
930     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
931   case TargetOpcode::G_FPTOUI:
932     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
933   case TargetOpcode::G_SITOFP:
934     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
935   case TargetOpcode::G_UITOFP:
936     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
937   }
938   llvm_unreachable("Unsupported libcall function");
939 }
940 
941 static LegalizerHelper::LegalizeResult
942 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
943                   Type *FromType, LostDebugLocObserver &LocObserver,
944                   const TargetLowering &TLI, bool IsSigned = false) {
945   CallLowering::ArgInfo Arg = {MI.getOperand(1).getReg(), FromType, 0};
946   if (FromType->isIntegerTy()) {
947     if (TLI.shouldSignExtendTypeInLibCall(FromType, IsSigned))
948       Arg.Flags[0].setSExt();
949     else
950       Arg.Flags[0].setZExt();
951   }
952 
953   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
954   return createLibcall(MIRBuilder, Libcall,
955                        {MI.getOperand(0).getReg(), ToType, 0}, Arg, LocObserver,
956                        &MI);
957 }
958 
959 static RTLIB::Libcall
960 getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
961   RTLIB::Libcall RTLibcall;
962   switch (MI.getOpcode()) {
963   case TargetOpcode::G_GET_FPENV:
964     RTLibcall = RTLIB::FEGETENV;
965     break;
966   case TargetOpcode::G_SET_FPENV:
967   case TargetOpcode::G_RESET_FPENV:
968     RTLibcall = RTLIB::FESETENV;
969     break;
970   case TargetOpcode::G_GET_FPMODE:
971     RTLibcall = RTLIB::FEGETMODE;
972     break;
973   case TargetOpcode::G_SET_FPMODE:
974   case TargetOpcode::G_RESET_FPMODE:
975     RTLibcall = RTLIB::FESETMODE;
976     break;
977   default:
978     llvm_unreachable("Unexpected opcode");
979   }
980   return RTLibcall;
981 }
982 
983 // Some library functions that read FP state (fegetmode, fegetenv) write the
984 // state into a region in memory. IR intrinsics that do the same operations
985 // (get_fpmode, get_fpenv) return the state as integer value. To implement these
986 // intrinsics via the library functions, we need to use temporary variable,
987 // for example:
988 //
989 //     %0:_(s32) = G_GET_FPMODE
990 //
991 // is transformed to:
992 //
993 //     %1:_(p0) = G_FRAME_INDEX %stack.0
994 //     BL &fegetmode
995 //     %0:_(s32) = G_LOAD % 1
996 //
997 LegalizerHelper::LegalizeResult
998 LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
999                                        MachineInstr &MI,
1000                                        LostDebugLocObserver &LocObserver) {
1001   const DataLayout &DL = MIRBuilder.getDataLayout();
1002   auto &MF = MIRBuilder.getMF();
1003   auto &MRI = *MIRBuilder.getMRI();
1004   auto &Ctx = MF.getFunction().getContext();
1005 
1006   // Create temporary, where library function will put the read state.
1007   Register Dst = MI.getOperand(0).getReg();
1008   LLT StateTy = MRI.getType(Dst);
1009   TypeSize StateSize = StateTy.getSizeInBytes();
1010   Align TempAlign = getStackTemporaryAlignment(StateTy);
1011   MachinePointerInfo TempPtrInfo;
1012   auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);
1013 
1014   // Create a call to library function, with the temporary as an argument.
1015   unsigned TempAddrSpace = DL.getAllocaAddrSpace();
1016   Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
1017   RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
1018   auto Res =
1019       createLibcall(MIRBuilder, RTLibcall,
1020                     CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
1021                     CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
1022                     LocObserver, nullptr);
1023   if (Res != LegalizerHelper::Legalized)
1024     return Res;
1025 
1026   // Create a load from the temporary.
1027   MachineMemOperand *MMO = MF.getMachineMemOperand(
1028       TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
1029   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);
1030 
1031   return LegalizerHelper::Legalized;
1032 }
1033 
1034 // Similar to `createGetStateLibcall` the function calls a library function
1035 // using transient space in stack. In this case the library function reads
1036 // content of memory region.
1037 LegalizerHelper::LegalizeResult
1038 LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
1039                                        MachineInstr &MI,
1040                                        LostDebugLocObserver &LocObserver) {
1041   const DataLayout &DL = MIRBuilder.getDataLayout();
1042   auto &MF = MIRBuilder.getMF();
1043   auto &MRI = *MIRBuilder.getMRI();
1044   auto &Ctx = MF.getFunction().getContext();
1045 
1046   // Create temporary, where library function will get the new state.
1047   Register Src = MI.getOperand(0).getReg();
1048   LLT StateTy = MRI.getType(Src);
1049   TypeSize StateSize = StateTy.getSizeInBytes();
1050   Align TempAlign = getStackTemporaryAlignment(StateTy);
1051   MachinePointerInfo TempPtrInfo;
1052   auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);
1053 
1054   // Put the new state into the temporary.
1055   MachineMemOperand *MMO = MF.getMachineMemOperand(
1056       TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
1057   MIRBuilder.buildStore(Src, Temp, *MMO);
1058 
1059   // Create a call to library function, with the temporary as an argument.
1060   unsigned TempAddrSpace = DL.getAllocaAddrSpace();
1061   Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
1062   RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
1063   return createLibcall(MIRBuilder, RTLibcall,
1064                        CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
1065                        CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
1066                        LocObserver, nullptr);
1067 }
1068 
/// Returns the corresponding libcall for the given Pred and
/// the ICMP predicate that should be generated to compare with #0
/// after the libcall. Only predicates with a direct soft-float helper are
/// handled here; anything else yields {UNKNOWN_LIBCALL, BAD_ICMP_PREDICATE}
/// and is synthesized from multiple helpers by createFCMPLibcall.
static std::pair<RTLIB::Libcall, CmpInst::Predicate>
getFCMPLibcallDesc(const CmpInst::Predicate Pred, unsigned Size) {
// Select the 32-, 64- or 128-bit variant of the comparison libcall; other
// operand sizes are rejected by the caller before reaching this point.
#define RTLIBCASE_CMP(LibcallPrefix, ICmpPred)                                 \
  do {                                                                         \
    switch (Size) {                                                            \
    case 32:                                                                   \
      return {RTLIB::LibcallPrefix##32, ICmpPred};                             \
    case 64:                                                                   \
      return {RTLIB::LibcallPrefix##64, ICmpPred};                             \
    case 128:                                                                  \
      return {RTLIB::LibcallPrefix##128, ICmpPred};                            \
    default:                                                                   \
      llvm_unreachable("unexpected size");                                     \
    }                                                                          \
  } while (0)

  switch (Pred) {
  case CmpInst::FCMP_OEQ:
    RTLIBCASE_CMP(OEQ_F, CmpInst::ICMP_EQ);
  case CmpInst::FCMP_UNE:
    RTLIBCASE_CMP(UNE_F, CmpInst::ICMP_NE);
  case CmpInst::FCMP_OGE:
    RTLIBCASE_CMP(OGE_F, CmpInst::ICMP_SGE);
  case CmpInst::FCMP_OLT:
    RTLIBCASE_CMP(OLT_F, CmpInst::ICMP_SLT);
  case CmpInst::FCMP_OLE:
    RTLIBCASE_CMP(OLE_F, CmpInst::ICMP_SLE);
  case CmpInst::FCMP_OGT:
    RTLIBCASE_CMP(OGT_F, CmpInst::ICMP_SGT);
  case CmpInst::FCMP_UNO:
    RTLIBCASE_CMP(UO_F, CmpInst::ICMP_NE);
  default:
    return {RTLIB::UNKNOWN_LIBCALL, CmpInst::BAD_ICMP_PREDICATE};
  }
}
1107 
// Lower G_FCMP to one or more soft-float comparison libcalls, each followed
// by an integer compare of the libcall's i32 result against zero. Predicates
// without a dedicated helper (UEQ, ONE, ULT, UGE, UGT, ULE, ORD) are
// synthesized from the OEQ/UNO helpers or from the inverse predicate's
// helper.
LegalizerHelper::LegalizeResult
LegalizerHelper::createFCMPLibcall(MachineIRBuilder &MIRBuilder,
                                   MachineInstr &MI,
                                   LostDebugLocObserver &LocObserver) {
  auto &MF = MIRBuilder.getMF();
  auto &Ctx = MF.getFunction().getContext();
  const GFCmp *Cmp = cast<GFCmp>(&MI);

  // Only f32/f64/f128 comparisons with identically-typed operands have
  // library helpers.
  LLT OpLLT = MRI.getType(Cmp->getLHSReg());
  unsigned Size = OpLLT.getSizeInBits();
  if ((Size != 32 && Size != 64 && Size != 128) ||
      OpLLT != MRI.getType(Cmp->getRHSReg()))
    return UnableToLegalize;

  Type *OpType = getFloatTypeForLLT(Ctx, OpLLT);

  // DstReg type is s32
  const Register DstReg = Cmp->getReg(0);
  LLT DstTy = MRI.getType(DstReg);
  const auto Cond = Cmp->getCond();

  // Reference:
  // https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html#Comparison-functions-1
  // Generates a libcall followed by ICMP. Returns the register holding the
  // ICMP result, or an invalid Register if the libcall could not be created.
  const auto BuildLibcall = [&](const RTLIB::Libcall Libcall,
                                const CmpInst::Predicate ICmpPred,
                                const DstOp &Res) -> Register {
    // FCMP libcall always returns an i32, and needs an ICMP with #0.
    constexpr LLT TempLLT = LLT::scalar(32);
    Register Temp = MRI.createGenericVirtualRegister(TempLLT);
    // Generate libcall, holding result in Temp
    const auto Status = createLibcall(
        MIRBuilder, Libcall, {Temp, Type::getInt32Ty(Ctx), 0},
        {{Cmp->getLHSReg(), OpType, 0}, {Cmp->getRHSReg(), OpType, 1}},
        LocObserver, &MI);
    if (!Status)
      return {};

    // Compare temp with #0 to get the final result.
    return MIRBuilder
        .buildICmp(ICmpPred, Res, Temp, MIRBuilder.buildConstant(TempLLT, 0))
        .getReg(0);
  };

  // Simple case if we have a direct mapping from predicate to libcall
  if (const auto [Libcall, ICmpPred] = getFCMPLibcallDesc(Cond, Size);
      Libcall != RTLIB::UNKNOWN_LIBCALL &&
      ICmpPred != CmpInst::BAD_ICMP_PREDICATE) {
    if (BuildLibcall(Libcall, ICmpPred, DstReg)) {
      return Legalized;
    }
    return UnableToLegalize;
  }

  // No direct mapping found, should be generated as combination of libcalls.

  switch (Cond) {
  case CmpInst::FCMP_UEQ: {
    // FCMP_UEQ: unordered or equal
    // Convert into (FCMP_OEQ || FCMP_UNO).

    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto Oeq = BuildLibcall(OeqLibcall, OeqPred, DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto Uno = BuildLibcall(UnoLibcall, UnoPred, DstTy);
    if (Oeq && Uno)
      MIRBuilder.buildOr(DstReg, Oeq, Uno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ONE: {
    // FCMP_ONE: ordered and operands are unequal
    // Convert into (!FCMP_OEQ && !FCMP_UNO).

    // We inverse the predicate instead of generating a NOT
    // to save one instruction.
    // On AArch64 isel can even select two cmp into a single ccmp.
    const auto [OeqLibcall, OeqPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_OEQ, Size);
    const auto NotOeq =
        BuildLibcall(OeqLibcall, CmpInst::getInversePredicate(OeqPred), DstTy);

    const auto [UnoLibcall, UnoPred] =
        getFCMPLibcallDesc(CmpInst::FCMP_UNO, Size);
    const auto NotUno =
        BuildLibcall(UnoLibcall, CmpInst::getInversePredicate(UnoPred), DstTy);

    if (NotOeq && NotUno)
      MIRBuilder.buildAnd(DstReg, NotOeq, NotUno);
    else
      return UnableToLegalize;

    break;
  }
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_ULE:
  case CmpInst::FCMP_ORD: {
    // Convert into: !(inverse(Pred))
    // E.g. FCMP_ULT becomes !FCMP_OGE
    // This is equivalent to the following, but saves some instructions.
    //   MIRBuilder.buildNot(
    //       PredTy,
    //       MIRBuilder.buildFCmp(CmpInst::getInversePredicate(Pred), PredTy,
    //                            Op1, Op2));
    const auto [InversedLibcall, InversedPred] =
        getFCMPLibcallDesc(CmpInst::getInversePredicate(Cond), Size);
    if (!BuildLibcall(InversedLibcall,
                      CmpInst::getInversePredicate(InversedPred), DstReg))
      return UnableToLegalize;
    break;
  }
  default:
    return UnableToLegalize;
  }

  return Legalized;
}
1232 
1233 // The function is used to legalize operations that set default environment
1234 // state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
1235 // On most targets supported in glibc FE_DFL_MODE is defined as
1236 // `((const femode_t *) -1)`. Such assumption is used here. If for some target
1237 // it is not true, the target must provide custom lowering.
1238 LegalizerHelper::LegalizeResult
1239 LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
1240                                          MachineInstr &MI,
1241                                          LostDebugLocObserver &LocObserver) {
1242   const DataLayout &DL = MIRBuilder.getDataLayout();
1243   auto &MF = MIRBuilder.getMF();
1244   auto &Ctx = MF.getFunction().getContext();
1245 
1246   // Create an argument for the library function.
1247   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
1248   Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
1249   unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
1250   LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
1251   auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
1252   DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
1253   MIRBuilder.buildIntToPtr(Dest, DefValue);
1254 
1255   RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
1256   return createLibcall(MIRBuilder, RTLibcall,
1257                        CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
1258                        CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
1259                        LocObserver, &MI);
1260 }
1261 
/// Legalize \p MI by replacing it with a call into the runtime library.
/// Returns UnableToLegalize when the operand type has no libcall equivalent
/// or the call could not be lowered. On success the original instruction is
/// erased: most cases fall through to the shared erase at the bottom, while
/// cases that return early (G_FCMP, the lround/lrint family, sincos and the
/// memory intrinsics) take care of erasure on their own paths.
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();

  switch (MI.getOpcode()) {
  default:
    return UnableToLegalize;
  // Integer operations: call the helper matching the result's bit width.
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = IntegerType::get(Ctx, Size);
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  // FP operations: only f32/f64/f80/f128 have runtime helpers.
  case TargetOpcode::G_FADD:
  case TargetOpcode::G_FSUB:
  case TargetOpcode::G_FMUL:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FMA:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FTAN:
  case TargetOpcode::G_FACOS:
  case TargetOpcode::G_FASIN:
  case TargetOpcode::G_FATAN:
  case TargetOpcode::G_FATAN2:
  case TargetOpcode::G_FCOSH:
  case TargetOpcode::G_FSINH:
  case TargetOpcode::G_FTANH:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FRINT:
  case TargetOpcode::G_FNEARBYINT:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_INTRINSIC_ROUND:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FSINCOS: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    // Returns directly: emitSincosLibcall is responsible for the result.
    return emitSincosLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
  }
  // FP-to-integer rounding: FP source (operand 1) selects the libcall,
  // result width (operand 0) selects the integer return type.
  case TargetOpcode::G_LROUND:
  case TargetOpcode::G_LLROUND:
  case TargetOpcode::G_INTRINSIC_LRINT:
  case TargetOpcode::G_INTRINSIC_LLRINT: {
    LLT LLTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ITy, 0},
                      {{MI.getOperand(1).getReg(), HLTy, 0}}, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Legalized;
  }
  // FP value plus integer exponent argument; the exponent is sign-extended.
  case TargetOpcode::G_FPOWI:
  case TargetOpcode::G_FLDEXP: {
    LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
    unsigned Size = LLTy.getSizeInBits();
    Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
    Type *ITy = IntegerType::get(
        Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
    if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
      LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
      return UnableToLegalize;
    }
    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
    SmallVector<CallLowering::ArgInfo, 2> Args = {
        {MI.getOperand(1).getReg(), HLTy, 0},
        {MI.getOperand(2).getReg(), ITy, 1}};
    Args[1].Flags[0].setSExt();
    LegalizeResult Status =
        createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
                      Args, LocObserver, &MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FPEXT:
  case TargetOpcode::G_FPTRUNC: {
    Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
    Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if (!FromTy || !ToTy)
      return UnableToLegalize;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_FCMP: {
    LegalizeResult Status = createFCMPLibcall(MIRBuilder, MI, LocObserver);
    if (Status != Legalized)
      return Status;
    MI.eraseFromParent();
    return Status;
  }
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI: {
    // FIXME: Support other types
    Type *FromTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
    unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((ToSize != 32 && ToSize != 64 && ToSize != 128) || !FromTy)
      return UnableToLegalize;
    LegalizeResult Status = conversionLibcall(
        MI, MIRBuilder, Type::getIntNTy(Ctx, ToSize), FromTy, LocObserver, TLI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP: {
    unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
    Type *ToTy =
        getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
    if ((FromSize != 32 && FromSize != 64 && FromSize != 128) || !ToTy)
      return UnableToLegalize;
    // IsSigned controls how the integer source is extended for the call.
    bool IsSigned = MI.getOpcode() == TargetOpcode::G_SITOFP;
    LegalizeResult Status =
        conversionLibcall(MI, MIRBuilder, ToTy, Type::getIntNTy(Ctx, FromSize),
                          LocObserver, TLI, IsSigned);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
    auto Status = createAtomicLibcall(MIRBuilder, MI);
    if (Status != Legalized)
      return Status;
    break;
  }
  case TargetOpcode::G_BZERO:
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
  case TargetOpcode::G_MEMSET: {
    LegalizeResult Result =
        createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
    if (Result != Legalized)
      return Result;
    MI.eraseFromParent();
    return Result;
  }
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_GET_FPMODE: {
    LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_SET_FPMODE: {
    LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  case TargetOpcode::G_RESET_FPENV:
  case TargetOpcode::G_RESET_FPMODE: {
    LegalizeResult Result =
        createResetStateLibcall(MIRBuilder, MI, LocObserver);
    if (Result != Legalized)
      return Result;
    break;
  }
  }

  // Shared cleanup for all cases that reached here via 'break'.
  MI.eraseFromParent();
  return Legalized;
}
1482 
1483 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1484                                                               unsigned TypeIdx,
1485                                                               LLT NarrowTy) {
1486   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1487   uint64_t NarrowSize = NarrowTy.getSizeInBits();
1488 
1489   switch (MI.getOpcode()) {
1490   default:
1491     return UnableToLegalize;
1492   case TargetOpcode::G_IMPLICIT_DEF: {
1493     Register DstReg = MI.getOperand(0).getReg();
1494     LLT DstTy = MRI.getType(DstReg);
1495 
1496     // If SizeOp0 is not an exact multiple of NarrowSize, emit
1497     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1498     // FIXME: Although this would also be legal for the general case, it causes
1499     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
1500     //  combines not being hit). This seems to be a problem related to the
1501     //  artifact combiner.
1502     if (SizeOp0 % NarrowSize != 0) {
1503       LLT ImplicitTy = NarrowTy;
1504       if (DstTy.isVector())
1505         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
1506 
1507       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
1508       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
1509 
1510       MI.eraseFromParent();
1511       return Legalized;
1512     }
1513 
1514     int NumParts = SizeOp0 / NarrowSize;
1515 
1516     SmallVector<Register, 2> DstRegs;
1517     for (int i = 0; i < NumParts; ++i)
1518       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
1519 
1520     if (DstTy.isVector())
1521       MIRBuilder.buildBuildVector(DstReg, DstRegs);
1522     else
1523       MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1524     MI.eraseFromParent();
1525     return Legalized;
1526   }
1527   case TargetOpcode::G_CONSTANT: {
1528     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1529     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
1530     unsigned TotalSize = Ty.getSizeInBits();
1531     unsigned NarrowSize = NarrowTy.getSizeInBits();
1532     int NumParts = TotalSize / NarrowSize;
1533 
1534     SmallVector<Register, 4> PartRegs;
1535     for (int I = 0; I != NumParts; ++I) {
1536       unsigned Offset = I * NarrowSize;
1537       auto K = MIRBuilder.buildConstant(NarrowTy,
1538                                         Val.lshr(Offset).trunc(NarrowSize));
1539       PartRegs.push_back(K.getReg(0));
1540     }
1541 
1542     LLT LeftoverTy;
1543     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1544     SmallVector<Register, 1> LeftoverRegs;
1545     if (LeftoverBits != 0) {
1546       LeftoverTy = LLT::scalar(LeftoverBits);
1547       auto K = MIRBuilder.buildConstant(
1548         LeftoverTy,
1549         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
1550       LeftoverRegs.push_back(K.getReg(0));
1551     }
1552 
1553     insertParts(MI.getOperand(0).getReg(),
1554                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1555 
1556     MI.eraseFromParent();
1557     return Legalized;
1558   }
1559   case TargetOpcode::G_SEXT:
1560   case TargetOpcode::G_ZEXT:
1561   case TargetOpcode::G_ANYEXT:
1562     return narrowScalarExt(MI, TypeIdx, NarrowTy);
1563   case TargetOpcode::G_TRUNC: {
1564     if (TypeIdx != 1)
1565       return UnableToLegalize;
1566 
1567     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1568     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1569       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1570       return UnableToLegalize;
1571     }
1572 
1573     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1574     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1575     MI.eraseFromParent();
1576     return Legalized;
1577   }
1578   case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
1579   case TargetOpcode::G_FREEZE: {
1580     if (TypeIdx != 0)
1581       return UnableToLegalize;
1582 
1583     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1584     // Should widen scalar first
1585     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1586       return UnableToLegalize;
1587 
1588     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1589     SmallVector<Register, 8> Parts;
1590     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1591       Parts.push_back(
1592           MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, {Unmerge.getReg(i)})
1593               .getReg(0));
1594     }
1595 
1596     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1597     MI.eraseFromParent();
1598     return Legalized;
1599   }
1600   case TargetOpcode::G_ADD:
1601   case TargetOpcode::G_SUB:
1602   case TargetOpcode::G_SADDO:
1603   case TargetOpcode::G_SSUBO:
1604   case TargetOpcode::G_SADDE:
1605   case TargetOpcode::G_SSUBE:
1606   case TargetOpcode::G_UADDO:
1607   case TargetOpcode::G_USUBO:
1608   case TargetOpcode::G_UADDE:
1609   case TargetOpcode::G_USUBE:
1610     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1611   case TargetOpcode::G_MUL:
1612   case TargetOpcode::G_UMULH:
1613     return narrowScalarMul(MI, NarrowTy);
1614   case TargetOpcode::G_EXTRACT:
1615     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1616   case TargetOpcode::G_INSERT:
1617     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1618   case TargetOpcode::G_LOAD: {
1619     auto &LoadMI = cast<GLoad>(MI);
1620     Register DstReg = LoadMI.getDstReg();
1621     LLT DstTy = MRI.getType(DstReg);
1622     if (DstTy.isVector())
1623       return UnableToLegalize;
1624 
1625     if (8 * LoadMI.getMemSize().getValue() != DstTy.getSizeInBits()) {
1626       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1627       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1628       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1629       LoadMI.eraseFromParent();
1630       return Legalized;
1631     }
1632 
1633     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1634   }
1635   case TargetOpcode::G_ZEXTLOAD:
1636   case TargetOpcode::G_SEXTLOAD: {
1637     auto &LoadMI = cast<GExtLoad>(MI);
1638     Register DstReg = LoadMI.getDstReg();
1639     Register PtrReg = LoadMI.getPointerReg();
1640 
1641     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1642     auto &MMO = LoadMI.getMMO();
1643     unsigned MemSize = MMO.getSizeInBits().getValue();
1644 
1645     if (MemSize == NarrowSize) {
1646       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1647     } else if (MemSize < NarrowSize) {
1648       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1649     } else if (MemSize > NarrowSize) {
1650       // FIXME: Need to split the load.
1651       return UnableToLegalize;
1652     }
1653 
1654     if (isa<GZExtLoad>(LoadMI))
1655       MIRBuilder.buildZExt(DstReg, TmpReg);
1656     else
1657       MIRBuilder.buildSExt(DstReg, TmpReg);
1658 
1659     LoadMI.eraseFromParent();
1660     return Legalized;
1661   }
1662   case TargetOpcode::G_STORE: {
1663     auto &StoreMI = cast<GStore>(MI);
1664 
1665     Register SrcReg = StoreMI.getValueReg();
1666     LLT SrcTy = MRI.getType(SrcReg);
1667     if (SrcTy.isVector())
1668       return UnableToLegalize;
1669 
1670     int NumParts = SizeOp0 / NarrowSize;
1671     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1672     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1673     if (SrcTy.isVector() && LeftoverBits != 0)
1674       return UnableToLegalize;
1675 
1676     if (8 * StoreMI.getMemSize().getValue() != SrcTy.getSizeInBits()) {
1677       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1678       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1679       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1680       StoreMI.eraseFromParent();
1681       return Legalized;
1682     }
1683 
1684     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1685   }
1686   case TargetOpcode::G_SELECT:
1687     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1688   case TargetOpcode::G_AND:
1689   case TargetOpcode::G_OR:
1690   case TargetOpcode::G_XOR: {
1691     // Legalize bitwise operation:
1692     // A = BinOp<Ty> B, C
1693     // into:
1694     // B1, ..., BN = G_UNMERGE_VALUES B
1695     // C1, ..., CN = G_UNMERGE_VALUES C
1696     // A1 = BinOp<Ty/N> B1, C2
1697     // ...
1698     // AN = BinOp<Ty/N> BN, CN
1699     // A = G_MERGE_VALUES A1, ..., AN
1700     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1701   }
1702   case TargetOpcode::G_SHL:
1703   case TargetOpcode::G_LSHR:
1704   case TargetOpcode::G_ASHR:
1705     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1706   case TargetOpcode::G_CTLZ:
1707   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1708   case TargetOpcode::G_CTTZ:
1709   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1710   case TargetOpcode::G_CTPOP:
1711     if (TypeIdx == 1)
1712       switch (MI.getOpcode()) {
1713       case TargetOpcode::G_CTLZ:
1714       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1715         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1716       case TargetOpcode::G_CTTZ:
1717       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1718         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1719       case TargetOpcode::G_CTPOP:
1720         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1721       default:
1722         return UnableToLegalize;
1723       }
1724 
1725     Observer.changingInstr(MI);
1726     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1727     Observer.changedInstr(MI);
1728     return Legalized;
1729   case TargetOpcode::G_INTTOPTR:
1730     if (TypeIdx != 1)
1731       return UnableToLegalize;
1732 
1733     Observer.changingInstr(MI);
1734     narrowScalarSrc(MI, NarrowTy, 1);
1735     Observer.changedInstr(MI);
1736     return Legalized;
1737   case TargetOpcode::G_PTRTOINT:
1738     if (TypeIdx != 0)
1739       return UnableToLegalize;
1740 
1741     Observer.changingInstr(MI);
1742     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1743     Observer.changedInstr(MI);
1744     return Legalized;
1745   case TargetOpcode::G_PHI: {
1746     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1747     // NarrowSize.
1748     if (SizeOp0 % NarrowSize != 0)
1749       return UnableToLegalize;
1750 
1751     unsigned NumParts = SizeOp0 / NarrowSize;
1752     SmallVector<Register, 2> DstRegs(NumParts);
1753     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1754     Observer.changingInstr(MI);
1755     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1756       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1757       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1758       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1759                    SrcRegs[i / 2], MIRBuilder, MRI);
1760     }
1761     MachineBasicBlock &MBB = *MI.getParent();
1762     MIRBuilder.setInsertPt(MBB, MI);
1763     for (unsigned i = 0; i < NumParts; ++i) {
1764       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1765       MachineInstrBuilder MIB =
1766           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1767       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1768         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1769     }
1770     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1771     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1772     Observer.changedInstr(MI);
1773     MI.eraseFromParent();
1774     return Legalized;
1775   }
1776   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1777   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1778     if (TypeIdx != 2)
1779       return UnableToLegalize;
1780 
1781     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1782     Observer.changingInstr(MI);
1783     narrowScalarSrc(MI, NarrowTy, OpIdx);
1784     Observer.changedInstr(MI);
1785     return Legalized;
1786   }
1787   case TargetOpcode::G_ICMP: {
1788     Register LHS = MI.getOperand(2).getReg();
1789     LLT SrcTy = MRI.getType(LHS);
1790     CmpInst::Predicate Pred =
1791         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1792 
1793     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1794     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1795     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1796                       LHSLeftoverRegs, MIRBuilder, MRI))
1797       return UnableToLegalize;
1798 
1799     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1800     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1801     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1802                       RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI))
1803       return UnableToLegalize;
1804 
1805     // We now have the LHS and RHS of the compare split into narrow-type
1806     // registers, plus potentially some leftover type.
1807     Register Dst = MI.getOperand(0).getReg();
1808     LLT ResTy = MRI.getType(Dst);
1809     if (ICmpInst::isEquality(Pred)) {
1810       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1811       // them together. For each equal part, the result should be all 0s. For
1812       // each non-equal part, we'll get at least one 1.
1813       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1814       SmallVector<Register, 4> Xors;
1815       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1816         auto LHS = std::get<0>(LHSAndRHS);
1817         auto RHS = std::get<1>(LHSAndRHS);
1818         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1819         Xors.push_back(Xor);
1820       }
1821 
1822       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1823       // to the desired narrow type so that we can OR them together later.
1824       SmallVector<Register, 4> WidenedXors;
1825       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1826         auto LHS = std::get<0>(LHSAndRHS);
1827         auto RHS = std::get<1>(LHSAndRHS);
1828         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1829         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1830         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1831                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1832         llvm::append_range(Xors, WidenedXors);
1833       }
1834 
1835       // Now, for each part we broke up, we know if they are equal/not equal
1836       // based off the G_XOR. We can OR these all together and compare against
1837       // 0 to get the result.
1838       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1839       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1840       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1841         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1842       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1843     } else {
1844       Register CmpIn;
1845       for (unsigned I = 0, E = LHSPartRegs.size(); I != E; ++I) {
1846         Register CmpOut;
1847         CmpInst::Predicate PartPred;
1848 
1849         if (I == E - 1 && LHSLeftoverRegs.empty()) {
1850           PartPred = Pred;
1851           CmpOut = Dst;
1852         } else {
1853           PartPred = ICmpInst::getUnsignedPredicate(Pred);
1854           CmpOut = MRI.createGenericVirtualRegister(ResTy);
1855         }
1856 
1857         if (!CmpIn) {
1858           MIRBuilder.buildICmp(PartPred, CmpOut, LHSPartRegs[I],
1859                                RHSPartRegs[I]);
1860         } else {
1861           auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSPartRegs[I],
1862                                           RHSPartRegs[I]);
1863           auto CmpEq = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy,
1864                                             LHSPartRegs[I], RHSPartRegs[I]);
1865           MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp);
1866         }
1867 
1868         CmpIn = CmpOut;
1869       }
1870 
1871       for (unsigned I = 0, E = LHSLeftoverRegs.size(); I != E; ++I) {
1872         Register CmpOut;
1873         CmpInst::Predicate PartPred;
1874 
1875         if (I == E - 1 && LHSLeftoverRegs.empty()) {
1876           PartPred = Pred;
1877           CmpOut = Dst;
1878         } else {
1879           PartPred = ICmpInst::getUnsignedPredicate(Pred);
1880           CmpOut = MRI.createGenericVirtualRegister(ResTy);
1881         }
1882 
1883         if (!CmpIn) {
1884           MIRBuilder.buildICmp(PartPred, CmpOut, LHSLeftoverRegs[I],
1885                                RHSLeftoverRegs[I]);
1886         } else {
1887           auto Cmp = MIRBuilder.buildICmp(PartPred, ResTy, LHSLeftoverRegs[I],
1888                                           RHSLeftoverRegs[I]);
1889           auto CmpEq =
1890               MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy,
1891                                    LHSLeftoverRegs[I], RHSLeftoverRegs[I]);
1892           MIRBuilder.buildSelect(CmpOut, CmpEq, CmpIn, Cmp);
1893         }
1894 
1895         CmpIn = CmpOut;
1896       }
1897     }
1898     MI.eraseFromParent();
1899     return Legalized;
1900   }
1901   case TargetOpcode::G_FCMP:
1902     if (TypeIdx != 0)
1903       return UnableToLegalize;
1904 
1905     Observer.changingInstr(MI);
1906     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1907     Observer.changedInstr(MI);
1908     return Legalized;
1909 
1910   case TargetOpcode::G_SEXT_INREG: {
1911     if (TypeIdx != 0)
1912       return UnableToLegalize;
1913 
1914     int64_t SizeInBits = MI.getOperand(2).getImm();
1915 
1916     // So long as the new type has more bits than the bits we're extending we
1917     // don't need to break it apart.
1918     if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1919       Observer.changingInstr(MI);
1920       // We don't lose any non-extension bits by truncating the src and
1921       // sign-extending the dst.
1922       MachineOperand &MO1 = MI.getOperand(1);
1923       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1924       MO1.setReg(TruncMIB.getReg(0));
1925 
1926       MachineOperand &MO2 = MI.getOperand(0);
1927       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1928       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1929       MIRBuilder.buildSExt(MO2, DstExt);
1930       MO2.setReg(DstExt);
1931       Observer.changedInstr(MI);
1932       return Legalized;
1933     }
1934 
1935     // Break it apart. Components below the extension point are unmodified. The
1936     // component containing the extension point becomes a narrower SEXT_INREG.
1937     // Components above it are ashr'd from the component containing the
1938     // extension point.
1939     if (SizeOp0 % NarrowSize != 0)
1940       return UnableToLegalize;
1941     int NumParts = SizeOp0 / NarrowSize;
1942 
1943     // List the registers where the destination will be scattered.
1944     SmallVector<Register, 2> DstRegs;
1945     // List the registers where the source will be split.
1946     SmallVector<Register, 2> SrcRegs;
1947 
1948     // Create all the temporary registers.
1949     for (int i = 0; i < NumParts; ++i) {
1950       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1951 
1952       SrcRegs.push_back(SrcReg);
1953     }
1954 
1955     // Explode the big arguments into smaller chunks.
1956     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1957 
1958     Register AshrCstReg =
1959         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1960             .getReg(0);
1961     Register FullExtensionReg;
1962     Register PartialExtensionReg;
1963 
1964     // Do the operation on each small part.
1965     for (int i = 0; i < NumParts; ++i) {
1966       if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1967         DstRegs.push_back(SrcRegs[i]);
1968         PartialExtensionReg = DstRegs.back();
1969       } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1970         assert(PartialExtensionReg &&
1971                "Expected to visit partial extension before full");
1972         if (FullExtensionReg) {
1973           DstRegs.push_back(FullExtensionReg);
1974           continue;
1975         }
1976         DstRegs.push_back(
1977             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1978                 .getReg(0));
1979         FullExtensionReg = DstRegs.back();
1980       } else {
1981         DstRegs.push_back(
1982             MIRBuilder
1983                 .buildInstr(
1984                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1985                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1986                 .getReg(0));
1987         PartialExtensionReg = DstRegs.back();
1988       }
1989     }
1990 
1991     // Gather the destination registers into the final destination.
1992     Register DstReg = MI.getOperand(0).getReg();
1993     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1994     MI.eraseFromParent();
1995     return Legalized;
1996   }
1997   case TargetOpcode::G_BSWAP:
1998   case TargetOpcode::G_BITREVERSE: {
1999     if (SizeOp0 % NarrowSize != 0)
2000       return UnableToLegalize;
2001 
2002     Observer.changingInstr(MI);
2003     SmallVector<Register, 2> SrcRegs, DstRegs;
2004     unsigned NumParts = SizeOp0 / NarrowSize;
2005     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
2006                  MIRBuilder, MRI);
2007 
2008     for (unsigned i = 0; i < NumParts; ++i) {
2009       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
2010                                            {SrcRegs[NumParts - 1 - i]});
2011       DstRegs.push_back(DstPart.getReg(0));
2012     }
2013 
2014     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
2015 
2016     Observer.changedInstr(MI);
2017     MI.eraseFromParent();
2018     return Legalized;
2019   }
2020   case TargetOpcode::G_PTR_ADD:
2021   case TargetOpcode::G_PTRMASK: {
2022     if (TypeIdx != 1)
2023       return UnableToLegalize;
2024     Observer.changingInstr(MI);
2025     narrowScalarSrc(MI, NarrowTy, 2);
2026     Observer.changedInstr(MI);
2027     return Legalized;
2028   }
2029   case TargetOpcode::G_FPTOUI:
2030   case TargetOpcode::G_FPTOSI:
2031   case TargetOpcode::G_FPTOUI_SAT:
2032   case TargetOpcode::G_FPTOSI_SAT:
2033     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
2034   case TargetOpcode::G_FPEXT:
2035     if (TypeIdx != 0)
2036       return UnableToLegalize;
2037     Observer.changingInstr(MI);
2038     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
2039     Observer.changedInstr(MI);
2040     return Legalized;
2041   case TargetOpcode::G_FLDEXP:
2042   case TargetOpcode::G_STRICT_FLDEXP:
2043     return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
2044   case TargetOpcode::G_VSCALE: {
2045     Register Dst = MI.getOperand(0).getReg();
2046     LLT Ty = MRI.getType(Dst);
2047 
2048     // Assume VSCALE(1) fits into a legal integer
2049     const APInt One(NarrowTy.getSizeInBits(), 1);
2050     auto VScaleBase = MIRBuilder.buildVScale(NarrowTy, One);
2051     auto ZExt = MIRBuilder.buildZExt(Ty, VScaleBase);
2052     auto C = MIRBuilder.buildConstant(Ty, *MI.getOperand(1).getCImm());
2053     MIRBuilder.buildMul(Dst, ZExt, C);
2054 
2055     MI.eraseFromParent();
2056     return Legalized;
2057   }
2058   }
2059 }
2060 
2061 Register LegalizerHelper::coerceToScalar(Register Val) {
2062   LLT Ty = MRI.getType(Val);
2063   if (Ty.isScalar())
2064     return Val;
2065 
2066   const DataLayout &DL = MIRBuilder.getDataLayout();
2067   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
2068   if (Ty.isPointer()) {
2069     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
2070       return Register();
2071     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
2072   }
2073 
2074   Register NewVal = Val;
2075 
2076   assert(Ty.isVector());
2077   if (Ty.isPointerVector())
2078     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
2079   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
2080 }
2081 
2082 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
2083                                      unsigned OpIdx, unsigned ExtOpcode) {
2084   MachineOperand &MO = MI.getOperand(OpIdx);
2085   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
2086   MO.setReg(ExtB.getReg(0));
2087 }
2088 
2089 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
2090                                       unsigned OpIdx) {
2091   MachineOperand &MO = MI.getOperand(OpIdx);
2092   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
2093   MO.setReg(ExtB.getReg(0));
2094 }
2095 
2096 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
2097                                      unsigned OpIdx, unsigned TruncOpcode) {
2098   MachineOperand &MO = MI.getOperand(OpIdx);
2099   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2100   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2101   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
2102   MO.setReg(DstExt);
2103 }
2104 
2105 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
2106                                       unsigned OpIdx, unsigned ExtOpcode) {
2107   MachineOperand &MO = MI.getOperand(OpIdx);
2108   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
2109   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2110   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
2111   MO.setReg(DstTrunc);
2112 }
2113 
2114 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
2115                                             unsigned OpIdx) {
2116   MachineOperand &MO = MI.getOperand(OpIdx);
2117   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2118   Register Dst = MO.getReg();
2119   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2120   MO.setReg(DstExt);
2121   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
2122 }
2123 
2124 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
2125                                             unsigned OpIdx) {
2126   MachineOperand &MO = MI.getOperand(OpIdx);
2127   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
2128 }
2129 
2130 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2131   MachineOperand &Op = MI.getOperand(OpIdx);
2132   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
2133 }
2134 
2135 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
2136   MachineOperand &MO = MI.getOperand(OpIdx);
2137   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
2138   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2139   MIRBuilder.buildBitcast(MO, CastDst);
2140   MO.setReg(CastDst);
2141 }
2142 
2143 LegalizerHelper::LegalizeResult
2144 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
2145                                         LLT WideTy) {
2146   if (TypeIdx != 1)
2147     return UnableToLegalize;
2148 
2149   auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
2150   if (DstTy.isVector())
2151     return UnableToLegalize;
2152 
2153   LLT SrcTy = MRI.getType(Src1Reg);
2154   const int DstSize = DstTy.getSizeInBits();
2155   const int SrcSize = SrcTy.getSizeInBits();
2156   const int WideSize = WideTy.getSizeInBits();
2157   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
2158 
2159   unsigned NumOps = MI.getNumOperands();
2160   unsigned NumSrc = MI.getNumOperands() - 1;
2161   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
2162 
2163   if (WideSize >= DstSize) {
2164     // Directly pack the bits in the target type.
2165     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
2166 
2167     for (unsigned I = 2; I != NumOps; ++I) {
2168       const unsigned Offset = (I - 1) * PartSize;
2169 
2170       Register SrcReg = MI.getOperand(I).getReg();
2171       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
2172 
2173       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
2174 
2175       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
2176         MRI.createGenericVirtualRegister(WideTy);
2177 
2178       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
2179       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
2180       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
2181       ResultReg = NextResult;
2182     }
2183 
2184     if (WideSize > DstSize)
2185       MIRBuilder.buildTrunc(DstReg, ResultReg);
2186     else if (DstTy.isPointer())
2187       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
2188 
2189     MI.eraseFromParent();
2190     return Legalized;
2191   }
2192 
2193   // Unmerge the original values to the GCD type, and recombine to the next
2194   // multiple greater than the original type.
2195   //
2196   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
2197   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
2198   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
2199   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
2200   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
2201   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
2202   // %12:_(s12) = G_MERGE_VALUES %10, %11
2203   //
2204   // Padding with undef if necessary:
2205   //
2206   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
2207   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
2208   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
2209   // %7:_(s2) = G_IMPLICIT_DEF
2210   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
2211   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
2212   // %10:_(s12) = G_MERGE_VALUES %8, %9
2213 
2214   const int GCD = std::gcd(SrcSize, WideSize);
2215   LLT GCDTy = LLT::scalar(GCD);
2216 
2217   SmallVector<Register, 8> NewMergeRegs;
2218   SmallVector<Register, 8> Unmerges;
2219   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
2220 
2221   // Decompose the original operands if they don't evenly divide.
2222   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
2223     Register SrcReg = MO.getReg();
2224     if (GCD == SrcSize) {
2225       Unmerges.push_back(SrcReg);
2226     } else {
2227       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
2228       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
2229         Unmerges.push_back(Unmerge.getReg(J));
2230     }
2231   }
2232 
2233   // Pad with undef to the next size that is a multiple of the requested size.
2234   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
2235     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
2236     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
2237       Unmerges.push_back(UndefReg);
2238   }
2239 
2240   const int PartsPerGCD = WideSize / GCD;
2241 
2242   // Build merges of each piece.
2243   ArrayRef<Register> Slicer(Unmerges);
2244   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
2245     auto Merge =
2246         MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
2247     NewMergeRegs.push_back(Merge.getReg(0));
2248   }
2249 
2250   // A truncate may be necessary if the requested type doesn't evenly divide the
2251   // original result type.
2252   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
2253     MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
2254   } else {
2255     auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
2256     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
2257   }
2258 
2259   MI.eraseFromParent();
2260   return Legalized;
2261 }
2262 
// Widen the destination (result) scalar type of a G_UNMERGE_VALUES to WideTy.
// Only TypeIdx 0 (the result type) is handled; scalar results only. Depending
// on sizes this either dissolves the unmerge into shift+trunc extraction, or
// rebuilds it via an any-extended source and an unmerge/re-merge sequence.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
                                          LLT WideTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  // Operand layout: NumDst defs followed by the single source use.
  int NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy.isVector())
    return UnableToLegalize;

  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (!DstTy.isScalar())
    return UnableToLegalize;

  if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
    // The requested result type covers the whole source, so no real unmerge
    // remains: each result is just a right-shifted, truncated slice.
    if (SrcTy.isPointer()) {
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
        LLVM_DEBUG(
            dbgs() << "Not casting non-integral address space integer\n");
        return UnableToLegalize;
      }

      SrcTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
    }

    // Widen SrcTy to WideTy. This does not affect the result, but since the
    // user requested this size, it is probably better handled than SrcTy and
    // should reduce the total number of legalization artifacts.
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      SrcTy = WideTy;
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
    }

    // There's no unmerge type to target. Directly extract the bits from the
    // source type.
    unsigned DstSize = DstTy.getSizeInBits();

    // Result I takes bits [I*DstSize, (I+1)*DstSize) of the source.
    MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
    for (int I = 1; I != NumDst; ++I) {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
      auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // Extend the source to a wider type.
  LLT LCMTy = getLCMType(SrcTy, WideTy);

  Register WideSrc = SrcReg;
  if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
    // TODO: If this is an integral address space, cast to integer and anyext.
    if (SrcTy.isPointer()) {
      LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
      return UnableToLegalize;
    }

    WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
  }

  auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);

  // Create a sequence of unmerges and merges to the original results. Since we
  // may have widened the source, we will need to pad the results with dead defs
  // to cover the source register.
  // e.g. widen s48 to s64:
  // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
  //
  // =>
  //  %4:_(s192) = G_ANYEXT %0:_(s96)
  //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
  //  ; unpack to GCD type, with extra dead defs
  //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
  //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
  //  dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
  //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
  //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
  const LLT GCDTy = getGCDType(WideTy, DstTy);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();

  // Directly unmerge to the destination without going through a GCD type
  // if possible
  if (PartsPerRemerge == 1) {
    const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();

    for (int I = 0; I != NumUnmerge; ++I) {
      auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

      for (int J = 0; J != PartsPerUnmerge; ++J) {
        int Idx = I * PartsPerUnmerge + J;
        if (Idx < NumDst)
          MIB.addDef(MI.getOperand(Idx).getReg());
        else {
          // Create dead def for excess components.
          MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
        }
      }

      MIB.addUse(Unmerge.getReg(I));
    }
  } else {
    // Unpack every wide piece down to the GCD type, then merge runs of
    // PartsPerRemerge GCD pieces back into each original destination.
    SmallVector<Register, 16> Parts;
    for (int J = 0; J != NumUnmerge; ++J)
      extractGCDType(Parts, GCDTy, Unmerge.getReg(J));

    SmallVector<Register, 8> RemergeParts;
    for (int I = 0; I != NumDst; ++I) {
      for (int J = 0; J < PartsPerRemerge; ++J) {
        const int Idx = I * PartsPerRemerge + J;
        RemergeParts.emplace_back(Parts[Idx]);
      }

      MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
      RemergeParts.clear();
    }
  }

  MI.eraseFromParent();
  return Legalized;
}
2391 
// Widen a G_EXTRACT. For TypeIdx 0 (the result), the instruction is replaced
// with shift/trunc arithmetic; for TypeIdx 1 (the source), the source is
// any-extended in place (with an adjusted bit offset for vector sources).
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                    LLT WideTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm();

  if (TypeIdx == 0) {
    if (SrcTy.isVector() || DstTy.isVector())
      return UnableToLegalize;

    SrcOp Src(SrcReg);
    if (SrcTy.isPointer()) {
      // Extracts from pointers can be handled only if they are really just
      // simple integers.
      const DataLayout &DL = MIRBuilder.getDataLayout();
      if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
        return UnableToLegalize;

      LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
      Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
      SrcTy = SrcAsIntTy;
    }

    if (DstTy.isPointer())
      return UnableToLegalize;

    if (Offset == 0) {
      // Avoid a shift in the degenerate case.
      MIRBuilder.buildTrunc(DstReg,
                            MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
      MI.eraseFromParent();
      return Legalized;
    }

    // Do a shift in the source type.
    LLT ShiftTy = SrcTy;
    if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
      Src = MIRBuilder.buildAnyExt(WideTy, Src);
      ShiftTy = WideTy;
    }

    // Bring the extracted field down to bit 0, then truncate to the result.
    auto LShr = MIRBuilder.buildLShr(
      ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
    MIRBuilder.buildTrunc(DstReg, LShr);
    MI.eraseFromParent();
    return Legalized;
  }

  // TypeIdx == 1: widen the source operand. A scalar source can simply be
  // any-extended; the extract offset still addresses the same low bits.
  if (SrcTy.isScalar()) {
    Observer.changingInstr(MI);
    widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
    Observer.changedInstr(MI);
    return Legalized;
  }

  if (!SrcTy.isVector())
    return UnableToLegalize;

  // Vector source: only element extracts at an element-aligned offset.
  if (DstTy != SrcTy.getElementType())
    return UnableToLegalize;

  if (Offset % SrcTy.getScalarSizeInBits() != 0)
    return UnableToLegalize;

  Observer.changingInstr(MI);
  widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);

  // Scale the bit offset by the element widening factor so it still points
  // at the same element in the widened vector.
  MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
                          Offset);
  widenScalarDst(MI, WideTy.getScalarType(), 0);
  Observer.changedInstr(MI);
  return Legalized;
}
2465 
2466 LegalizerHelper::LegalizeResult
2467 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2468                                    LLT WideTy) {
2469   if (TypeIdx != 0 || WideTy.isVector())
2470     return UnableToLegalize;
2471   Observer.changingInstr(MI);
2472   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2473   widenScalarDst(MI, WideTy);
2474   Observer.changedInstr(MI);
2475   return Legalized;
2476 }
2477 
// Widen overflow-producing add/sub (G_[SU]ADDO/G_[SU]SUBO and the
// carry-consuming G_[SU]ADDE/G_[SU]SUBE). TypeIdx 1 widens only the
// carry-out (and carry-in) booleans in place; TypeIdx 0 re-does the
// arithmetic in WideTy and recomputes overflow by comparing against a
// round-tripped (trunc-then-extend) copy of the wide result.
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
                                           LLT WideTy) {
  unsigned Opcode;
  unsigned ExtOpcode;
  std::optional<Register> CarryIn;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case TargetOpcode::G_SADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_SSUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_SEXT;
    break;
  case TargetOpcode::G_UADDO:
    Opcode = TargetOpcode::G_ADD;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  case TargetOpcode::G_USUBO:
    Opcode = TargetOpcode::G_SUB;
    ExtOpcode = TargetOpcode::G_ZEXT;
    break;
  // For the carry-consuming forms, the wide operation keeps an unsigned
  // carry chain (G_UADDE/G_USUBE); signedness only affects how the operands
  // are extended and how overflow is re-derived below.
  case TargetOpcode::G_SADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_SSUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_SEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_UADDE:
    Opcode = TargetOpcode::G_UADDE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  case TargetOpcode::G_USUBE:
    Opcode = TargetOpcode::G_USUBE;
    ExtOpcode = TargetOpcode::G_ZEXT;
    CarryIn = MI.getOperand(4).getReg();
    break;
  }

  if (TypeIdx == 1) {
    // Widen only the boolean carry operands in place.
    unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);

    Observer.changingInstr(MI);
    if (CarryIn)
      widenScalarSrc(MI, WideTy, 4, BoolExtOp);
    widenScalarDst(MI, WideTy, 1);

    Observer.changedInstr(MI);
    return Legalized;
  }

  auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
  auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
  // Do the arithmetic in the larger type.
  Register NewOp;
  if (CarryIn) {
    LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
    NewOp = MIRBuilder
                .buildInstr(Opcode, {WideTy, CarryOutTy},
                            {LHSExt, RHSExt, *CarryIn})
                .getReg(0);
  } else {
    NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
  }
  // Round-trip the wide result through the original width: if the narrow
  // result re-extends to the same wide value, no overflow occurred.
  LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
  auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
  auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
  // There is no overflow if the ExtOp is the same as NewOp.
  MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
  // Now trunc the NewOp to the original result.
  MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
  MI.eraseFromParent();
  return Legalized;
}
2560 
2561 LegalizerHelper::LegalizeResult
2562 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2563                                          LLT WideTy) {
2564   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2565                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2566                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2567   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2568                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
2569   // We can convert this to:
2570   //   1. Any extend iN to iM
2571   //   2. SHL by M-N
2572   //   3. [US][ADD|SUB|SHL]SAT
2573   //   4. L/ASHR by M-N
2574   //
2575   // It may be more efficient to lower this to a min and a max operation in
2576   // the higher precision arithmetic if the promoted operation isn't legal,
2577   // but this decision is up to the target's lowering request.
2578   Register DstReg = MI.getOperand(0).getReg();
2579 
2580   unsigned NewBits = WideTy.getScalarSizeInBits();
2581   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
2582 
2583   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2584   // must not left shift the RHS to preserve the shift amount.
2585   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
2586   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
2587                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
2588   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
2589   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
2590   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
2591 
2592   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
2593                                         {ShiftL, ShiftR}, MI.getFlags());
2594 
2595   // Use a shift that will preserve the number of sign bits when the trunc is
2596   // folded away.
2597   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
2598                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
2599 
2600   MIRBuilder.buildTrunc(DstReg, Result);
2601   MI.eraseFromParent();
2602   return Legalized;
2603 }
2604 
// Widen G_SMULO/G_UMULO. TypeIdx 1 widens only the overflow boolean in
// place. TypeIdx 0 performs the multiply in WideTy and re-derives overflow
// from the high bits of the wide product (plus the wide op's own overflow
// bit when the wide type is not at least twice the source width).
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
                                 LLT WideTy) {
  if (TypeIdx == 1) {
    Observer.changingInstr(MI);
    widenScalarDst(MI, WideTy, 1);
    Observer.changedInstr(MI);
    return Legalized;
  }

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
  auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
  LLT SrcTy = MRI.getType(LHS);
  LLT OverflowTy = MRI.getType(OriginalOverflow);
  unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();

  // To determine if the result overflowed in the larger type, we extend the
  // input to the larger type, do the multiply (checking if it overflows),
  // then also check the high bits of the result to see if overflow happened
  // there.
  unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
  auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
  auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});

  // Multiplication cannot overflow if the WideTy is >= 2 * original width,
  // so we don't need to check the overflow result of larger type Mulo.
  bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;

  // Use a plain G_MUL when the wide multiply provably cannot overflow;
  // otherwise keep the overflow-producing opcode at the wider width.
  unsigned MulOpc =
      WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;

  MachineInstrBuilder Mulo;
  if (WideMulCanOverflow)
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
                                 {LeftOperand, RightOperand});
  else
    Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});

  auto Mul = Mulo->getOperand(0);
  MIRBuilder.buildTrunc(Result, Mul);

  MachineInstrBuilder ExtResult;
  // Overflow occurred if it occurred in the larger type, or if the high part
  // of the result does not zero/sign-extend the low part.  Check this second
  // possibility first.
  if (IsSigned) {
    // For signed, overflow occurred when the high part does not sign-extend
    // the low part.
    ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
  } else {
    // Unsigned overflow occurred when the high part does not zero-extend the
    // low part.
    ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
  }

  if (WideMulCanOverflow) {
    auto Overflow =
        MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
    // Finally check if the multiplication in the larger type itself overflowed.
    MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
  } else {
    MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
  }
  MI.eraseFromParent();
  return Legalized;
}
2671 
2672 LegalizerHelper::LegalizeResult
2673 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2674   unsigned Opcode = MI.getOpcode();
2675   switch (Opcode) {
2676   default:
2677     return UnableToLegalize;
2678   case TargetOpcode::G_ATOMICRMW_XCHG:
2679   case TargetOpcode::G_ATOMICRMW_ADD:
2680   case TargetOpcode::G_ATOMICRMW_SUB:
2681   case TargetOpcode::G_ATOMICRMW_AND:
2682   case TargetOpcode::G_ATOMICRMW_OR:
2683   case TargetOpcode::G_ATOMICRMW_XOR:
2684   case TargetOpcode::G_ATOMICRMW_MIN:
2685   case TargetOpcode::G_ATOMICRMW_MAX:
2686   case TargetOpcode::G_ATOMICRMW_UMIN:
2687   case TargetOpcode::G_ATOMICRMW_UMAX:
2688     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2689     Observer.changingInstr(MI);
2690     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2691     widenScalarDst(MI, WideTy, 0);
2692     Observer.changedInstr(MI);
2693     return Legalized;
2694   case TargetOpcode::G_ATOMIC_CMPXCHG:
2695     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2696     Observer.changingInstr(MI);
2697     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2698     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2699     widenScalarDst(MI, WideTy, 0);
2700     Observer.changedInstr(MI);
2701     return Legalized;
2702   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2703     if (TypeIdx == 0) {
2704       Observer.changingInstr(MI);
2705       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2706       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2707       widenScalarDst(MI, WideTy, 0);
2708       Observer.changedInstr(MI);
2709       return Legalized;
2710     }
2711     assert(TypeIdx == 1 &&
2712            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2713     Observer.changingInstr(MI);
2714     widenScalarDst(MI, WideTy, 1);
2715     Observer.changedInstr(MI);
2716     return Legalized;
2717   case TargetOpcode::G_EXTRACT:
2718     return widenScalarExtract(MI, TypeIdx, WideTy);
2719   case TargetOpcode::G_INSERT:
2720     return widenScalarInsert(MI, TypeIdx, WideTy);
2721   case TargetOpcode::G_MERGE_VALUES:
2722     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2723   case TargetOpcode::G_UNMERGE_VALUES:
2724     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2725   case TargetOpcode::G_SADDO:
2726   case TargetOpcode::G_SSUBO:
2727   case TargetOpcode::G_UADDO:
2728   case TargetOpcode::G_USUBO:
2729   case TargetOpcode::G_SADDE:
2730   case TargetOpcode::G_SSUBE:
2731   case TargetOpcode::G_UADDE:
2732   case TargetOpcode::G_USUBE:
2733     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2734   case TargetOpcode::G_UMULO:
2735   case TargetOpcode::G_SMULO:
2736     return widenScalarMulo(MI, TypeIdx, WideTy);
2737   case TargetOpcode::G_SADDSAT:
2738   case TargetOpcode::G_SSUBSAT:
2739   case TargetOpcode::G_SSHLSAT:
2740   case TargetOpcode::G_UADDSAT:
2741   case TargetOpcode::G_USUBSAT:
2742   case TargetOpcode::G_USHLSAT:
2743     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2744   case TargetOpcode::G_CTTZ:
2745   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2746   case TargetOpcode::G_CTLZ:
2747   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2748   case TargetOpcode::G_CTPOP: {
2749     if (TypeIdx == 0) {
2750       Observer.changingInstr(MI);
2751       widenScalarDst(MI, WideTy, 0);
2752       Observer.changedInstr(MI);
2753       return Legalized;
2754     }
2755 
2756     Register SrcReg = MI.getOperand(1).getReg();
2757 
2758     // First extend the input.
2759     unsigned ExtOpc = Opcode == TargetOpcode::G_CTTZ ||
2760                               Opcode == TargetOpcode::G_CTTZ_ZERO_UNDEF
2761                           ? TargetOpcode::G_ANYEXT
2762                           : TargetOpcode::G_ZEXT;
2763     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2764     LLT CurTy = MRI.getType(SrcReg);
2765     unsigned NewOpc = Opcode;
2766     if (NewOpc == TargetOpcode::G_CTTZ) {
2767       // The count is the same in the larger type except if the original
2768       // value was zero.  This can be handled by setting the bit just off
2769       // the top of the original type.
2770       auto TopBit =
2771           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2772       MIBSrc = MIRBuilder.buildOr(
2773         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2774       // Now we know the operand is non-zero, use the more relaxed opcode.
2775       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2776     }
2777 
2778     unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2779 
2780     if (Opcode == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2781       // An optimization where the result is the CTLZ after the left shift by
2782       // (Difference in widety and current ty), that is,
2783       // MIBSrc = MIBSrc << (sizeinbits(WideTy) - sizeinbits(CurTy))
2784       // Result = ctlz MIBSrc
2785       MIBSrc = MIRBuilder.buildShl(WideTy, MIBSrc,
2786                                    MIRBuilder.buildConstant(WideTy, SizeDiff));
2787     }
2788 
2789     // Perform the operation at the larger size.
2790     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2791     // This is already the correct result for CTPOP and CTTZs
2792     if (Opcode == TargetOpcode::G_CTLZ) {
2793       // The correct result is NewOp - (Difference in widety and current ty).
2794       MIBNewOp = MIRBuilder.buildSub(
2795           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2796     }
2797 
2798     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2799     MI.eraseFromParent();
2800     return Legalized;
2801   }
2802   case TargetOpcode::G_BSWAP: {
2803     Observer.changingInstr(MI);
2804     Register DstReg = MI.getOperand(0).getReg();
2805 
2806     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2807     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2808     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2809     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2810 
2811     MI.getOperand(0).setReg(DstExt);
2812 
2813     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2814 
2815     LLT Ty = MRI.getType(DstReg);
2816     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2817     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2818     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2819 
2820     MIRBuilder.buildTrunc(DstReg, ShrReg);
2821     Observer.changedInstr(MI);
2822     return Legalized;
2823   }
2824   case TargetOpcode::G_BITREVERSE: {
2825     Observer.changingInstr(MI);
2826 
2827     Register DstReg = MI.getOperand(0).getReg();
2828     LLT Ty = MRI.getType(DstReg);
2829     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2830 
2831     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2832     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2833     MI.getOperand(0).setReg(DstExt);
2834     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2835 
2836     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2837     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2838     MIRBuilder.buildTrunc(DstReg, Shift);
2839     Observer.changedInstr(MI);
2840     return Legalized;
2841   }
2842   case TargetOpcode::G_FREEZE:
2843   case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
2844     Observer.changingInstr(MI);
2845     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2846     widenScalarDst(MI, WideTy);
2847     Observer.changedInstr(MI);
2848     return Legalized;
2849 
2850   case TargetOpcode::G_ABS:
2851     Observer.changingInstr(MI);
2852     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2853     widenScalarDst(MI, WideTy);
2854     Observer.changedInstr(MI);
2855     return Legalized;
2856 
2857   case TargetOpcode::G_ADD:
2858   case TargetOpcode::G_AND:
2859   case TargetOpcode::G_MUL:
2860   case TargetOpcode::G_OR:
2861   case TargetOpcode::G_XOR:
2862   case TargetOpcode::G_SUB:
2863   case TargetOpcode::G_SHUFFLE_VECTOR:
2864     // Perform operation at larger width (any extension is fines here, high bits
2865     // don't affect the result) and then truncate the result back to the
2866     // original type.
2867     Observer.changingInstr(MI);
2868     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2869     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2870     widenScalarDst(MI, WideTy);
2871     Observer.changedInstr(MI);
2872     return Legalized;
2873 
2874   case TargetOpcode::G_SBFX:
2875   case TargetOpcode::G_UBFX:
2876     Observer.changingInstr(MI);
2877 
2878     if (TypeIdx == 0) {
2879       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2880       widenScalarDst(MI, WideTy);
2881     } else {
2882       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2883       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2884     }
2885 
2886     Observer.changedInstr(MI);
2887     return Legalized;
2888 
2889   case TargetOpcode::G_SHL:
2890     Observer.changingInstr(MI);
2891 
2892     if (TypeIdx == 0) {
2893       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2894       widenScalarDst(MI, WideTy);
2895     } else {
2896       assert(TypeIdx == 1);
2897       // The "number of bits to shift" operand must preserve its value as an
2898       // unsigned integer:
2899       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2900     }
2901 
2902     Observer.changedInstr(MI);
2903     return Legalized;
2904 
2905   case TargetOpcode::G_ROTR:
2906   case TargetOpcode::G_ROTL:
2907     if (TypeIdx != 1)
2908       return UnableToLegalize;
2909 
2910     Observer.changingInstr(MI);
2911     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2912     Observer.changedInstr(MI);
2913     return Legalized;
2914 
2915   case TargetOpcode::G_SDIV:
2916   case TargetOpcode::G_SREM:
2917   case TargetOpcode::G_SMIN:
2918   case TargetOpcode::G_SMAX:
2919     Observer.changingInstr(MI);
2920     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2921     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2922     widenScalarDst(MI, WideTy);
2923     Observer.changedInstr(MI);
2924     return Legalized;
2925 
2926   case TargetOpcode::G_SDIVREM:
2927     Observer.changingInstr(MI);
2928     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2929     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2930     widenScalarDst(MI, WideTy);
2931     widenScalarDst(MI, WideTy, 1);
2932     Observer.changedInstr(MI);
2933     return Legalized;
2934 
2935   case TargetOpcode::G_ASHR:
2936   case TargetOpcode::G_LSHR:
2937     Observer.changingInstr(MI);
2938 
2939     if (TypeIdx == 0) {
2940       unsigned CvtOp = Opcode == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT
2941                                                       : TargetOpcode::G_ZEXT;
2942 
2943       widenScalarSrc(MI, WideTy, 1, CvtOp);
2944       widenScalarDst(MI, WideTy);
2945     } else {
2946       assert(TypeIdx == 1);
2947       // The "number of bits to shift" operand must preserve its value as an
2948       // unsigned integer:
2949       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2950     }
2951 
2952     Observer.changedInstr(MI);
2953     return Legalized;
2954   case TargetOpcode::G_UDIV:
2955   case TargetOpcode::G_UREM:
2956     Observer.changingInstr(MI);
2957     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2958     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2959     widenScalarDst(MI, WideTy);
2960     Observer.changedInstr(MI);
2961     return Legalized;
2962   case TargetOpcode::G_UDIVREM:
2963     Observer.changingInstr(MI);
2964     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2965     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2966     widenScalarDst(MI, WideTy);
2967     widenScalarDst(MI, WideTy, 1);
2968     Observer.changedInstr(MI);
2969     return Legalized;
2970   case TargetOpcode::G_UMIN:
2971   case TargetOpcode::G_UMAX: {
2972     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2973 
2974     auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
2975     unsigned ExtOpc =
2976         TLI.isSExtCheaperThanZExt(getApproximateEVTForLLT(Ty, Ctx),
2977                                   getApproximateEVTForLLT(WideTy, Ctx))
2978             ? TargetOpcode::G_SEXT
2979             : TargetOpcode::G_ZEXT;
2980 
2981     Observer.changingInstr(MI);
2982     widenScalarSrc(MI, WideTy, 1, ExtOpc);
2983     widenScalarSrc(MI, WideTy, 2, ExtOpc);
2984     widenScalarDst(MI, WideTy);
2985     Observer.changedInstr(MI);
2986     return Legalized;
2987   }
2988 
2989   case TargetOpcode::G_SELECT:
2990     Observer.changingInstr(MI);
2991     if (TypeIdx == 0) {
2992       // Perform operation at larger width (any extension is fine here, high
2993       // bits don't affect the result) and then truncate the result back to the
2994       // original type.
2995       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2996       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2997       widenScalarDst(MI, WideTy);
2998     } else {
2999       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
3000       // Explicit extension is required here since high bits affect the result.
3001       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
3002     }
3003     Observer.changedInstr(MI);
3004     return Legalized;
3005 
3006   case TargetOpcode::G_FPTOSI:
3007   case TargetOpcode::G_FPTOUI:
3008   case TargetOpcode::G_INTRINSIC_LRINT:
3009   case TargetOpcode::G_INTRINSIC_LLRINT:
3010   case TargetOpcode::G_IS_FPCLASS:
3011     Observer.changingInstr(MI);
3012 
3013     if (TypeIdx == 0)
3014       widenScalarDst(MI, WideTy);
3015     else
3016       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
3017 
3018     Observer.changedInstr(MI);
3019     return Legalized;
3020   case TargetOpcode::G_SITOFP:
3021     Observer.changingInstr(MI);
3022 
3023     if (TypeIdx == 0)
3024       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3025     else
3026       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
3027 
3028     Observer.changedInstr(MI);
3029     return Legalized;
3030   case TargetOpcode::G_UITOFP:
3031     Observer.changingInstr(MI);
3032 
3033     if (TypeIdx == 0)
3034       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3035     else
3036       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
3037 
3038     Observer.changedInstr(MI);
3039     return Legalized;
3040   case TargetOpcode::G_FPTOSI_SAT:
3041   case TargetOpcode::G_FPTOUI_SAT:
3042     Observer.changingInstr(MI);
3043 
3044     if (TypeIdx == 0) {
3045       Register OldDst = MI.getOperand(0).getReg();
3046       LLT Ty = MRI.getType(OldDst);
3047       Register ExtReg = MRI.createGenericVirtualRegister(WideTy);
3048       Register NewDst;
3049       MI.getOperand(0).setReg(ExtReg);
3050       uint64_t ShortBits = Ty.getScalarSizeInBits();
3051       uint64_t WideBits = WideTy.getScalarSizeInBits();
3052       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3053       if (Opcode == TargetOpcode::G_FPTOSI_SAT) {
3054         // z = i16 fptosi_sat(a)
3055         // ->
3056         // x = i32 fptosi_sat(a)
3057         // y = smin(x, 32767)
3058         // z = smax(y, -32768)
3059         auto MaxVal = MIRBuilder.buildConstant(
3060             WideTy, APInt::getSignedMaxValue(ShortBits).sext(WideBits));
3061         auto MinVal = MIRBuilder.buildConstant(
3062             WideTy, APInt::getSignedMinValue(ShortBits).sext(WideBits));
3063         Register MidReg =
3064             MIRBuilder.buildSMin(WideTy, ExtReg, MaxVal).getReg(0);
3065         NewDst = MIRBuilder.buildSMax(WideTy, MidReg, MinVal).getReg(0);
3066       } else {
3067         // z = i16 fptoui_sat(a)
3068         // ->
3069         // x = i32 fptoui_sat(a)
3070         // y = smin(x, 65535)
3071         auto MaxVal = MIRBuilder.buildConstant(
3072             WideTy, APInt::getAllOnes(ShortBits).zext(WideBits));
3073         NewDst = MIRBuilder.buildUMin(WideTy, ExtReg, MaxVal).getReg(0);
3074       }
3075       MIRBuilder.buildTrunc(OldDst, NewDst);
3076     } else
3077       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
3078 
3079     Observer.changedInstr(MI);
3080     return Legalized;
3081   case TargetOpcode::G_LOAD:
3082   case TargetOpcode::G_SEXTLOAD:
3083   case TargetOpcode::G_ZEXTLOAD:
3084     Observer.changingInstr(MI);
3085     widenScalarDst(MI, WideTy);
3086     Observer.changedInstr(MI);
3087     return Legalized;
3088 
3089   case TargetOpcode::G_STORE: {
3090     if (TypeIdx != 0)
3091       return UnableToLegalize;
3092 
3093     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3094     assert(!Ty.isPointerOrPointerVector() && "Can't widen type");
3095     if (!Ty.isScalar()) {
3096       // We need to widen the vector element type.
3097       Observer.changingInstr(MI);
3098       widenScalarSrc(MI, WideTy, 0, TargetOpcode::G_ANYEXT);
3099       // We also need to adjust the MMO to turn this into a truncating store.
3100       MachineMemOperand &MMO = **MI.memoperands_begin();
3101       MachineFunction &MF = MIRBuilder.getMF();
3102       auto *NewMMO = MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), Ty);
3103       MI.setMemRefs(MF, {NewMMO});
3104       Observer.changedInstr(MI);
3105       return Legalized;
3106     }
3107 
3108     Observer.changingInstr(MI);
3109 
3110     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
3111       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
3112     widenScalarSrc(MI, WideTy, 0, ExtType);
3113 
3114     Observer.changedInstr(MI);
3115     return Legalized;
3116   }
3117   case TargetOpcode::G_CONSTANT: {
3118     MachineOperand &SrcMO = MI.getOperand(1);
3119     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3120     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
3121         MRI.getType(MI.getOperand(0).getReg()));
3122     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
3123             ExtOpc == TargetOpcode::G_ANYEXT) &&
3124            "Illegal Extend");
3125     const APInt &SrcVal = SrcMO.getCImm()->getValue();
3126     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
3127                            ? SrcVal.sext(WideTy.getSizeInBits())
3128                            : SrcVal.zext(WideTy.getSizeInBits());
3129     Observer.changingInstr(MI);
3130     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
3131 
3132     widenScalarDst(MI, WideTy);
3133     Observer.changedInstr(MI);
3134     return Legalized;
3135   }
3136   case TargetOpcode::G_FCONSTANT: {
3137     // To avoid changing the bits of the constant due to extension to a larger
3138     // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
3139     MachineOperand &SrcMO = MI.getOperand(1);
3140     APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
3141     MIRBuilder.setInstrAndDebugLoc(MI);
3142     auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
3143     widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
3144     MI.eraseFromParent();
3145     return Legalized;
3146   }
3147   case TargetOpcode::G_IMPLICIT_DEF: {
3148     Observer.changingInstr(MI);
3149     widenScalarDst(MI, WideTy);
3150     Observer.changedInstr(MI);
3151     return Legalized;
3152   }
3153   case TargetOpcode::G_BRCOND:
3154     Observer.changingInstr(MI);
3155     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
3156     Observer.changedInstr(MI);
3157     return Legalized;
3158 
3159   case TargetOpcode::G_FCMP:
3160     Observer.changingInstr(MI);
3161     if (TypeIdx == 0)
3162       widenScalarDst(MI, WideTy);
3163     else {
3164       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
3165       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
3166     }
3167     Observer.changedInstr(MI);
3168     return Legalized;
3169 
3170   case TargetOpcode::G_ICMP:
3171     Observer.changingInstr(MI);
3172     if (TypeIdx == 0)
3173       widenScalarDst(MI, WideTy);
3174     else {
3175       LLT SrcTy = MRI.getType(MI.getOperand(2).getReg());
3176       CmpInst::Predicate Pred =
3177           static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3178 
3179       auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
3180       unsigned ExtOpcode =
3181           (CmpInst::isSigned(Pred) ||
3182            TLI.isSExtCheaperThanZExt(getApproximateEVTForLLT(SrcTy, Ctx),
3183                                      getApproximateEVTForLLT(WideTy, Ctx)))
3184               ? TargetOpcode::G_SEXT
3185               : TargetOpcode::G_ZEXT;
3186       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
3187       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
3188     }
3189     Observer.changedInstr(MI);
3190     return Legalized;
3191 
3192   case TargetOpcode::G_PTR_ADD:
3193     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
3194     Observer.changingInstr(MI);
3195     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
3196     Observer.changedInstr(MI);
3197     return Legalized;
3198 
3199   case TargetOpcode::G_PHI: {
3200     assert(TypeIdx == 0 && "Expecting only Idx 0");
3201 
3202     Observer.changingInstr(MI);
3203     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
3204       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
3205       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
3206       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
3207     }
3208 
3209     MachineBasicBlock &MBB = *MI.getParent();
3210     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
3211     widenScalarDst(MI, WideTy);
3212     Observer.changedInstr(MI);
3213     return Legalized;
3214   }
3215   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
3216     if (TypeIdx == 0) {
3217       Register VecReg = MI.getOperand(1).getReg();
3218       LLT VecTy = MRI.getType(VecReg);
3219       Observer.changingInstr(MI);
3220 
3221       widenScalarSrc(
3222           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
3223           TargetOpcode::G_ANYEXT);
3224 
3225       widenScalarDst(MI, WideTy, 0);
3226       Observer.changedInstr(MI);
3227       return Legalized;
3228     }
3229 
3230     if (TypeIdx != 2)
3231       return UnableToLegalize;
3232     Observer.changingInstr(MI);
3233     // TODO: Probably should be zext
3234     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
3235     Observer.changedInstr(MI);
3236     return Legalized;
3237   }
3238   case TargetOpcode::G_INSERT_VECTOR_ELT: {
3239     if (TypeIdx == 0) {
3240       Observer.changingInstr(MI);
3241       const LLT WideEltTy = WideTy.getElementType();
3242 
3243       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3244       widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT);
3245       widenScalarDst(MI, WideTy, 0);
3246       Observer.changedInstr(MI);
3247       return Legalized;
3248     }
3249 
3250     if (TypeIdx == 1) {
3251       Observer.changingInstr(MI);
3252 
3253       Register VecReg = MI.getOperand(1).getReg();
3254       LLT VecTy = MRI.getType(VecReg);
3255       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
3256 
3257       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
3258       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
3259       widenScalarDst(MI, WideVecTy, 0);
3260       Observer.changedInstr(MI);
3261       return Legalized;
3262     }
3263 
3264     if (TypeIdx == 2) {
3265       Observer.changingInstr(MI);
3266       // TODO: Probably should be zext
3267       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
3268       Observer.changedInstr(MI);
3269       return Legalized;
3270     }
3271 
3272     return UnableToLegalize;
3273   }
3274   case TargetOpcode::G_FADD:
3275   case TargetOpcode::G_FMUL:
3276   case TargetOpcode::G_FSUB:
3277   case TargetOpcode::G_FMA:
3278   case TargetOpcode::G_FMAD:
3279   case TargetOpcode::G_FNEG:
3280   case TargetOpcode::G_FABS:
3281   case TargetOpcode::G_FCANONICALIZE:
3282   case TargetOpcode::G_FMINNUM:
3283   case TargetOpcode::G_FMAXNUM:
3284   case TargetOpcode::G_FMINNUM_IEEE:
3285   case TargetOpcode::G_FMAXNUM_IEEE:
3286   case TargetOpcode::G_FMINIMUM:
3287   case TargetOpcode::G_FMAXIMUM:
3288   case TargetOpcode::G_FMINIMUMNUM:
3289   case TargetOpcode::G_FMAXIMUMNUM:
3290   case TargetOpcode::G_FDIV:
3291   case TargetOpcode::G_FREM:
3292   case TargetOpcode::G_FCEIL:
3293   case TargetOpcode::G_FFLOOR:
3294   case TargetOpcode::G_FCOS:
3295   case TargetOpcode::G_FSIN:
3296   case TargetOpcode::G_FTAN:
3297   case TargetOpcode::G_FACOS:
3298   case TargetOpcode::G_FASIN:
3299   case TargetOpcode::G_FATAN:
3300   case TargetOpcode::G_FATAN2:
3301   case TargetOpcode::G_FCOSH:
3302   case TargetOpcode::G_FSINH:
3303   case TargetOpcode::G_FTANH:
3304   case TargetOpcode::G_FLOG10:
3305   case TargetOpcode::G_FLOG:
3306   case TargetOpcode::G_FLOG2:
3307   case TargetOpcode::G_FRINT:
3308   case TargetOpcode::G_FNEARBYINT:
3309   case TargetOpcode::G_FSQRT:
3310   case TargetOpcode::G_FEXP:
3311   case TargetOpcode::G_FEXP2:
3312   case TargetOpcode::G_FEXP10:
3313   case TargetOpcode::G_FPOW:
3314   case TargetOpcode::G_INTRINSIC_TRUNC:
3315   case TargetOpcode::G_INTRINSIC_ROUND:
3316   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
3317     assert(TypeIdx == 0);
3318     Observer.changingInstr(MI);
3319 
3320     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3321       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
3322 
3323     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3324     Observer.changedInstr(MI);
3325     return Legalized;
3326   case TargetOpcode::G_FPOWI:
3327   case TargetOpcode::G_FLDEXP:
3328   case TargetOpcode::G_STRICT_FLDEXP: {
3329     if (TypeIdx == 0) {
3330       if (Opcode == TargetOpcode::G_STRICT_FLDEXP)
3331         return UnableToLegalize;
3332 
3333       Observer.changingInstr(MI);
3334       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
3335       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3336       Observer.changedInstr(MI);
3337       return Legalized;
3338     }
3339 
3340     if (TypeIdx == 1) {
3341       // For some reason SelectionDAG tries to promote to a libcall without
3342       // actually changing the integer type for promotion.
3343       Observer.changingInstr(MI);
3344       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
3345       Observer.changedInstr(MI);
3346       return Legalized;
3347     }
3348 
3349     return UnableToLegalize;
3350   }
3351   case TargetOpcode::G_FFREXP: {
3352     Observer.changingInstr(MI);
3353 
3354     if (TypeIdx == 0) {
3355       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
3356       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3357     } else {
3358       widenScalarDst(MI, WideTy, 1);
3359     }
3360 
3361     Observer.changedInstr(MI);
3362     return Legalized;
3363   }
3364   case TargetOpcode::G_INTTOPTR:
3365     if (TypeIdx != 1)
3366       return UnableToLegalize;
3367 
3368     Observer.changingInstr(MI);
3369     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
3370     Observer.changedInstr(MI);
3371     return Legalized;
3372   case TargetOpcode::G_PTRTOINT:
3373     if (TypeIdx != 0)
3374       return UnableToLegalize;
3375 
3376     Observer.changingInstr(MI);
3377     widenScalarDst(MI, WideTy, 0);
3378     Observer.changedInstr(MI);
3379     return Legalized;
3380   case TargetOpcode::G_BUILD_VECTOR: {
3381     Observer.changingInstr(MI);
3382 
3383     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
3384     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
3385       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
3386 
3387     // Avoid changing the result vector type if the source element type was
3388     // requested.
3389     if (TypeIdx == 1) {
3390       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
3391     } else {
3392       widenScalarDst(MI, WideTy, 0);
3393     }
3394 
3395     Observer.changedInstr(MI);
3396     return Legalized;
3397   }
3398   case TargetOpcode::G_SEXT_INREG:
3399     if (TypeIdx != 0)
3400       return UnableToLegalize;
3401 
3402     Observer.changingInstr(MI);
3403     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3404     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
3405     Observer.changedInstr(MI);
3406     return Legalized;
3407   case TargetOpcode::G_PTRMASK: {
3408     if (TypeIdx != 1)
3409       return UnableToLegalize;
3410     Observer.changingInstr(MI);
3411     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
3412     Observer.changedInstr(MI);
3413     return Legalized;
3414   }
3415   case TargetOpcode::G_VECREDUCE_ADD: {
3416     if (TypeIdx != 1)
3417       return UnableToLegalize;
3418     Observer.changingInstr(MI);
3419     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3420     widenScalarDst(MI, WideTy.getScalarType(), 0, TargetOpcode::G_TRUNC);
3421     Observer.changedInstr(MI);
3422     return Legalized;
3423   }
3424   case TargetOpcode::G_VECREDUCE_FADD:
3425   case TargetOpcode::G_VECREDUCE_FMUL:
3426   case TargetOpcode::G_VECREDUCE_FMIN:
3427   case TargetOpcode::G_VECREDUCE_FMAX:
3428   case TargetOpcode::G_VECREDUCE_FMINIMUM:
3429   case TargetOpcode::G_VECREDUCE_FMAXIMUM: {
3430     if (TypeIdx != 0)
3431       return UnableToLegalize;
3432     Observer.changingInstr(MI);
3433     Register VecReg = MI.getOperand(1).getReg();
3434     LLT VecTy = MRI.getType(VecReg);
3435     LLT WideVecTy = VecTy.isVector()
3436                         ? LLT::vector(VecTy.getElementCount(), WideTy)
3437                         : WideTy;
3438     widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT);
3439     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
3440     Observer.changedInstr(MI);
3441     return Legalized;
3442   }
3443   case TargetOpcode::G_VSCALE: {
3444     MachineOperand &SrcMO = MI.getOperand(1);
3445     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
3446     const APInt &SrcVal = SrcMO.getCImm()->getValue();
3447     // The CImm is always a signed value
3448     const APInt Val = SrcVal.sext(WideTy.getSizeInBits());
3449     Observer.changingInstr(MI);
3450     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
3451     widenScalarDst(MI, WideTy);
3452     Observer.changedInstr(MI);
3453     return Legalized;
3454   }
3455   case TargetOpcode::G_SPLAT_VECTOR: {
3456     if (TypeIdx != 1)
3457       return UnableToLegalize;
3458 
3459     Observer.changingInstr(MI);
3460     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
3461     Observer.changedInstr(MI);
3462     return Legalized;
3463   }
3464   case TargetOpcode::G_INSERT_SUBVECTOR: {
3465     if (TypeIdx != 0)
3466       return UnableToLegalize;
3467 
3468     GInsertSubvector &IS = cast<GInsertSubvector>(MI);
3469     Register BigVec = IS.getBigVec();
3470     Register SubVec = IS.getSubVec();
3471 
3472     LLT SubVecTy = MRI.getType(SubVec);
3473     LLT SubVecWideTy = SubVecTy.changeElementType(WideTy.getElementType());
3474 
3475     // Widen the G_INSERT_SUBVECTOR
3476     auto BigZExt = MIRBuilder.buildZExt(WideTy, BigVec);
3477     auto SubZExt = MIRBuilder.buildZExt(SubVecWideTy, SubVec);
3478     auto WideInsert = MIRBuilder.buildInsertSubvector(WideTy, BigZExt, SubZExt,
3479                                                       IS.getIndexImm());
3480 
3481     // Truncate back down
3482     auto SplatZero = MIRBuilder.buildSplatVector(
3483         WideTy, MIRBuilder.buildConstant(WideTy.getElementType(), 0));
3484     MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, IS.getReg(0), WideInsert,
3485                          SplatZero);
3486 
3487     MI.eraseFromParent();
3488 
3489     return Legalized;
3490   }
3491   }
3492 }
3493 
3494 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
3495                              MachineIRBuilder &B, Register Src, LLT Ty) {
3496   auto Unmerge = B.buildUnmerge(Ty, Src);
3497   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3498     Pieces.push_back(Unmerge.getReg(I));
3499 }
3500 
3501 static void emitLoadFromConstantPool(Register DstReg, const Constant *ConstVal,
3502                                      MachineIRBuilder &MIRBuilder) {
3503   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3504   MachineFunction &MF = MIRBuilder.getMF();
3505   const DataLayout &DL = MIRBuilder.getDataLayout();
3506   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
3507   LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3508   LLT DstLLT = MRI.getType(DstReg);
3509 
3510   Align Alignment(DL.getABITypeAlign(ConstVal->getType()));
3511 
3512   auto Addr = MIRBuilder.buildConstantPool(
3513       AddrPtrTy,
3514       MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment));
3515 
3516   MachineMemOperand *MMO =
3517       MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
3518                               MachineMemOperand::MOLoad, DstLLT, Alignment);
3519 
3520   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, DstReg, Addr, *MMO);
3521 }
3522 
3523 LegalizerHelper::LegalizeResult
3524 LegalizerHelper::lowerConstant(MachineInstr &MI) {
3525   const MachineOperand &ConstOperand = MI.getOperand(1);
3526   const Constant *ConstantVal = ConstOperand.getCImm();
3527 
3528   emitLoadFromConstantPool(MI.getOperand(0).getReg(), ConstantVal, MIRBuilder);
3529   MI.eraseFromParent();
3530 
3531   return Legalized;
3532 }
3533 
3534 LegalizerHelper::LegalizeResult
3535 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
3536   const MachineOperand &ConstOperand = MI.getOperand(1);
3537   const Constant *ConstantVal = ConstOperand.getFPImm();
3538 
3539   emitLoadFromConstantPool(MI.getOperand(0).getReg(), ConstantVal, MIRBuilder);
3540   MI.eraseFromParent();
3541 
3542   return Legalized;
3543 }
3544 
/// Lower a G_BITCAST by decomposing the source into pieces with
/// G_UNMERGE_VALUES and reassembling them as the destination type, inserting
/// intermediate bitcasts when the element counts of two vector types differ.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitcast(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  if (SrcTy.isVector()) {
    LLT SrcEltTy = SrcTy.getElementType();
    SmallVector<Register, 8> SrcRegs;

    if (DstTy.isVector()) {
      int NumDstElt = DstTy.getNumElements();
      int NumSrcElt = SrcTy.getNumElements();

      LLT DstEltTy = DstTy.getElementType();
      LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
      LLT SrcPartTy = SrcEltTy; // Original unmerge result type.

      // If there's an element size mismatch, insert intermediate casts to match
      // the result element type.
      if (NumSrcElt < NumDstElt) { // Source element type is larger.
        // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
        //
        // =>
        //
        // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
        // %4:_(<2 x s8>) = G_BITCAST %2
        // %5:_(<2 x s8>) = G_BITCAST %3
        // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
        DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
        SrcPartTy = SrcEltTy;
      } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
        //
        // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
        //
        // =>
        //
        // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
        // %4:_(s16) = G_BITCAST %2
        // %5:_(s16) = G_BITCAST %3
        // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
        SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
        DstCastTy = DstEltTy;
      }

      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
      // Cast each unmerged piece to the element (or sub-vector) type expected
      // by the final merge.
      for (Register &SrcReg : SrcRegs)
        SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
    } else
      getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);

    MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  if (DstTy.isVector()) {
    // Scalar source -> vector destination: split the scalar into
    // destination-element-sized pieces and merge them into the vector.
    SmallVector<Register, 8> SrcRegs;
    getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
    MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
3608 
3609 /// Figure out the bit offset into a register when coercing a vector index for
3610 /// the wide element type. This is only for the case when promoting vector to
3611 /// one with larger elements.
3612 //
3613 ///
3614 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3615 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3616 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3617                                                    Register Idx,
3618                                                    unsigned NewEltSize,
3619                                                    unsigned OldEltSize) {
3620   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3621   LLT IdxTy = B.getMRI()->getType(Idx);
3622 
3623   // Now figure out the amount we need to shift to get the target bits.
3624   auto OffsetMask = B.buildConstant(
3625       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
3626   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
3627   return B.buildShl(IdxTy, OffsetIdx,
3628                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
3629 }
3630 
/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
/// is casting to a vector with a smaller element size, perform multiple element
/// extracts and merge the results. If this is coercing to a vector with larger
/// elements, index the bitcasted vector and extract the target element with bit
/// operations. This is intended to force the indexing in the native register
/// size for architectures that can dynamically index the register file.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  // Only the source-vector type index can be changed here.
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();

  LLT SrcEltTy = SrcVecTy.getElementType();
  // CastTy may be a scalar, which behaves as a one-element vector here.
  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = SrcVecTy.getNumElements();

  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);

  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = SrcEltTy.getSizeInBits();
  if (NewNumElts > OldNumElts) {
    // Decreasing the vector element size: one wide element is rebuilt from
    // several narrow extracts.
    //
    // e.g. i64 = extract_vector_elt x:v2i64, y:i32
    //  =>
    //  v4i32:castx = bitcast x:v2i64
    //
    // i64 = bitcast
    //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
    //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
    //
    if (NewNumElts % OldNumElts != 0)
      return UnableToLegalize;

    // Type of the intermediate result vector.
    const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
    LLT MidTy =
        LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);

    auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);

    SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
    // First narrow element of the requested wide element.
    auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);

    // Extract each narrow piece of the wide element.
    for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
      auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
      auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
      auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
      NewOps[I] = Elt.getReg(0);
    }

    // Reassemble the pieces and bitcast back to the original element type.
    auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
    MIRBuilder.buildBitcast(Dst, NewVec);
    MI.eraseFromParent();
    return Legalized;
  }

  if (NewNumElts < OldNumElts) {
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    // Increasing the vector element size.
    // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
    //
    //   =>
    //
    // %cast = G_BITCAST %vec
    // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
    // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
    // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
    // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
    // %elt_bits = G_LSHR %wide_elt, %offset_bits
    // %elt = G_TRUNC %elt_bits

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // If CastTy is a scalar there is only one "wide element": the cast itself.
    Register WideElt = CastVec;
    if (CastTy.isVector()) {
      WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                     ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Shift the wide element to get the target element.
    auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
    MIRBuilder.buildTrunc(Dst, ExtractedBits);
    MI.eraseFromParent();
    return Legalized;
  }

  // Same element count: nothing to do here.
  return UnableToLegalize;
}
3739 
3740 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
3741 /// TargetReg, while preserving other bits in \p TargetReg.
3742 ///
3743 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
3744 static Register buildBitFieldInsert(MachineIRBuilder &B,
3745                                     Register TargetReg, Register InsertReg,
3746                                     Register OffsetBits) {
3747   LLT TargetTy = B.getMRI()->getType(TargetReg);
3748   LLT InsertTy = B.getMRI()->getType(InsertReg);
3749   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
3750   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
3751 
3752   // Produce a bitmask of the value to insert
3753   auto EltMask = B.buildConstant(
3754     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
3755                                    InsertTy.getSizeInBits()));
3756   // Shift it into position
3757   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
3758   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
3759 
3760   // Clear out the bits in the wide element
3761   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
3762 
3763   // The value to insert has all zeros already, so stick it into the masked
3764   // wide element.
3765   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
3766 }
3767 
/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
/// is increasing the element size, perform the indexing in the target element
/// type, and use bit operations to insert at the element position. This is
/// intended for architectures that can dynamically index the register file and
/// want to force indexing in the native register size.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  // Only the vector type index can be changed here.
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
      MI.getFirst4RegLLTs();
  LLT VecTy = DstTy;

  LLT VecEltTy = VecTy.getElementType();
  // CastTy may be a scalar, which behaves as a one-element vector here.
  LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
  const unsigned NewEltSize = NewEltTy.getSizeInBits();
  const unsigned OldEltSize = VecEltTy.getSizeInBits();

  unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
  unsigned OldNumElts = VecTy.getNumElements();

  Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
  if (NewNumElts < OldNumElts) {
    // Increasing the element size: index in the wide element type and splice
    // the narrow value into the containing wide element with bit operations.
    if (NewEltSize % OldEltSize != 0)
      return UnableToLegalize;

    // This only depends on powers of 2 because we use bit tricks to figure out
    // the bit offset we need to shift to get the target element. A general
    // expansion could emit division/multiply.
    if (!isPowerOf2_32(NewEltSize / OldEltSize))
      return UnableToLegalize;

    const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
    auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);

    // Divide to get the index in the wider element type.
    auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);

    // If CastTy is a scalar there is only one "wide element": the cast itself.
    Register ExtractedElt = CastVec;
    if (CastTy.isVector()) {
      ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
                                                          ScaledIdx).getReg(0);
    }

    // Compute the bit offset into the register of the target element.
    Register OffsetBits = getBitcastWiderVectorElementOffset(
      MIRBuilder, Idx, NewEltSize, OldEltSize);

    // Insert Val into the wide element, preserving the surrounding bits.
    Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
                                               Val, OffsetBits);
    if (CastTy.isVector()) {
      // Write the updated wide element back into the cast vector.
      InsertedElt = MIRBuilder.buildInsertVectorElement(
        CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
    }

    MIRBuilder.buildBitcast(Dst, InsertedElt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Decreasing (or keeping) the element size is not handled here.
  return UnableToLegalize;
}
3832 
3833 // This attempts to handle G_CONCAT_VECTORS with illegal operands, particularly
3834 // those that have smaller than legal operands.
3835 //
3836 // <16 x s8> = G_CONCAT_VECTORS <4 x s8>, <4 x s8>, <4 x s8>, <4 x s8>
3837 //
3838 // ===>
3839 //
3840 // s32 = G_BITCAST <4 x s8>
3841 // s32 = G_BITCAST <4 x s8>
3842 // s32 = G_BITCAST <4 x s8>
3843 // s32 = G_BITCAST <4 x s8>
3844 // <4 x s32> = G_BUILD_VECTOR s32, s32, s32, s32
3845 // <16 x s8> = G_BITCAST <4 x s32>
3846 LegalizerHelper::LegalizeResult
3847 LegalizerHelper::bitcastConcatVector(MachineInstr &MI, unsigned TypeIdx,
3848                                      LLT CastTy) {
3849   // Convert it to CONCAT instruction
3850   auto ConcatMI = dyn_cast<GConcatVectors>(&MI);
3851   if (!ConcatMI) {
3852     return UnableToLegalize;
3853   }
3854 
3855   // Check if bitcast is Legal
3856   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
3857   LLT SrcScalTy = LLT::scalar(SrcTy.getSizeInBits());
3858 
3859   // Check if the build vector is Legal
3860   if (!LI.isLegal({TargetOpcode::G_BUILD_VECTOR, {CastTy, SrcScalTy}})) {
3861     return UnableToLegalize;
3862   }
3863 
3864   // Bitcast the sources
3865   SmallVector<Register> BitcastRegs;
3866   for (unsigned i = 0; i < ConcatMI->getNumSources(); i++) {
3867     BitcastRegs.push_back(
3868         MIRBuilder.buildBitcast(SrcScalTy, ConcatMI->getSourceReg(i))
3869             .getReg(0));
3870   }
3871 
3872   // Build the scalar values into a vector
3873   Register BuildReg =
3874       MIRBuilder.buildBuildVector(CastTy, BitcastRegs).getReg(0);
3875   MIRBuilder.buildBitcast(DstReg, BuildReg);
3876 
3877   MI.eraseFromParent();
3878   return Legalized;
3879 }
3880 
3881 // This bitcasts a shuffle vector to a different type currently of the same
3882 // element size. Mostly used to legalize ptr vectors, where ptrtoint/inttoptr
3883 // will be used instead.
3884 //
3885 // <16 x p0> = G_CONCAT_VECTORS <4 x p0>, <4 x p0>, mask
3886 // ===>
3887 // <4 x s64> = G_PTRTOINT <4 x p0>
3888 // <4 x s64> = G_PTRTOINT <4 x p0>
3889 // <16 x s64> = G_CONCAT_VECTORS <4 x s64>, <4 x s64>, mask
3890 // <16 x p0> = G_INTTOPTR <16 x s64>
3891 LegalizerHelper::LegalizeResult
3892 LegalizerHelper::bitcastShuffleVector(MachineInstr &MI, unsigned TypeIdx,
3893                                       LLT CastTy) {
3894   auto ShuffleMI = cast<GShuffleVector>(&MI);
3895   LLT DstTy = MRI.getType(ShuffleMI->getReg(0));
3896   LLT SrcTy = MRI.getType(ShuffleMI->getReg(1));
3897 
3898   // We currently only handle vectors of the same size.
3899   if (TypeIdx != 0 ||
3900       CastTy.getScalarSizeInBits() != DstTy.getScalarSizeInBits() ||
3901       CastTy.getElementCount() != DstTy.getElementCount())
3902     return UnableToLegalize;
3903 
3904   LLT NewSrcTy = SrcTy.changeElementType(CastTy.getScalarType());
3905 
3906   auto Inp1 = MIRBuilder.buildCast(NewSrcTy, ShuffleMI->getReg(1));
3907   auto Inp2 = MIRBuilder.buildCast(NewSrcTy, ShuffleMI->getReg(2));
3908   auto Shuf =
3909       MIRBuilder.buildShuffleVector(CastTy, Inp1, Inp2, ShuffleMI->getMask());
3910   MIRBuilder.buildCast(ShuffleMI->getReg(0), Shuf);
3911 
3912   MI.eraseFromParent();
3913   return Legalized;
3914 }
3915 
/// This attempts to bitcast G_EXTRACT_SUBVECTOR to CastTy.
///
///  <vscale x 8 x i1> = G_EXTRACT_SUBVECTOR <vscale x 16 x i1>, N
///
/// ===>
///
///  <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
///  <vscale x 1 x i8> = G_EXTRACT_SUBVECTOR <vscale x 2 x i8>, N / 8
///  <vscale x 8 x i1> = G_BITCAST <vscale x 1 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastExtractSubvector(MachineInstr &MI, unsigned TypeIdx,
                                         LLT CastTy) {
  auto ES = cast<GExtractSubvector>(&MI);

  // We can only rewrite the result as another vector type.
  if (!CastTy.isVector())
    return UnableToLegalize;

  // Only the destination type (type index 0) is handled.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(0);
  Register Src = ES->getSrcVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount SrcTyEC = SrcTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto SrcTyMinElts = SrcTyEC.getKnownMinValue();

  // Nothing to do if the requested type is already in place.
  if (DstTy == CastTy)
    return Legalized;

  // A bitcast must preserve the total bit width.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  // Only widen elements; narrowing is not handled here.
  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // AdjustAmt is how many original elements fold into one cast element. The
  // index and both element counts must divide evenly for the rewrite to be
  // expressible.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      SrcTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  // Extract at the scaled-down index from the bitcast source, then cast the
  // result back to the original destination type.
  // NOTE(review): the rebuilt source element width is AdjustAmt bits, which
  // equals CastEltSize only when DstEltSize == 1 (the i1 case shown above) --
  // confirm wider destination element types cannot reach this point.
  Idx /= AdjustAmt;
  SrcTy = LLT::vector(SrcTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
  auto CastVec = MIRBuilder.buildBitcast(SrcTy, Src);
  auto PromotedES = MIRBuilder.buildExtractSubvector(CastTy, CastVec, Idx);
  MIRBuilder.buildBitcast(Dst, PromotedES);

  ES->eraseFromParent();
  return Legalized;
}
3974 
/// This attempts to bitcast G_INSERT_SUBVECTOR to CastTy.
///
///  <vscale x 16 x i1> = G_INSERT_SUBVECTOR <vscale x 16 x i1>,
///                                          <vscale x 8 x i1>,
///                                          N
///
/// ===>
///
///  <vscale x 2 x i8> = G_BITCAST <vscale x 16 x i1>
///  <vscale x 1 x i8> = G_BITCAST <vscale x 8 x i1>
///  <vscale x 2 x i8> = G_INSERT_SUBVECTOR <vscale x 2 x i8>,
///                                         <vscale x 1 x i8>, N / 8
///  <vscale x 16 x i1> = G_BITCAST <vscale x 2 x i8>
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcastInsertSubvector(MachineInstr &MI, unsigned TypeIdx,
                                        LLT CastTy) {
  auto ES = cast<GInsertSubvector>(&MI);

  // We can only rewrite the result as another vector type.
  if (!CastTy.isVector())
    return UnableToLegalize;

  // Only the destination type (type index 0) is handled.
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register Dst = ES->getReg(0);
  Register BigVec = ES->getBigVec();
  Register SubVec = ES->getSubVec();
  uint64_t Idx = ES->getIndexImm();

  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  LLT DstTy = MRI.getType(Dst);
  LLT BigVecTy = MRI.getType(BigVec);
  LLT SubVecTy = MRI.getType(SubVec);

  // Nothing to do if the requested type is already in place.
  if (DstTy == CastTy)
    return Legalized;

  // A bitcast must preserve the total bit width.
  if (DstTy.getSizeInBits() != CastTy.getSizeInBits())
    return UnableToLegalize;

  ElementCount DstTyEC = DstTy.getElementCount();
  ElementCount BigVecTyEC = BigVecTy.getElementCount();
  ElementCount SubVecTyEC = SubVecTy.getElementCount();
  auto DstTyMinElts = DstTyEC.getKnownMinValue();
  auto BigVecTyMinElts = BigVecTyEC.getKnownMinValue();
  auto SubVecTyMinElts = SubVecTyEC.getKnownMinValue();

  // Only widen elements; narrowing is not handled here.
  unsigned CastEltSize = CastTy.getElementType().getSizeInBits();
  unsigned DstEltSize = DstTy.getElementType().getSizeInBits();
  if (CastEltSize < DstEltSize)
    return UnableToLegalize;

  // AdjustAmt is how many original elements fold into one cast element. The
  // index and all three element counts must divide evenly for the rewrite to
  // be expressible.
  auto AdjustAmt = CastEltSize / DstEltSize;
  if (Idx % AdjustAmt != 0 || DstTyMinElts % AdjustAmt != 0 ||
      BigVecTyMinElts % AdjustAmt != 0 || SubVecTyMinElts % AdjustAmt != 0)
    return UnableToLegalize;

  // Insert at the scaled-down index using the bitcast operands, then cast the
  // result back to the original destination type.
  // NOTE(review): the rebuilt element width is AdjustAmt bits, which equals
  // CastEltSize only when DstEltSize == 1 (the i1 case shown above) -- confirm
  // wider destination element types cannot reach this point.
  Idx /= AdjustAmt;
  BigVecTy = LLT::vector(BigVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
  SubVecTy = LLT::vector(SubVecTyEC.divideCoefficientBy(AdjustAmt), AdjustAmt);
  auto CastBigVec = MIRBuilder.buildBitcast(BigVecTy, BigVec);
  auto CastSubVec = MIRBuilder.buildBitcast(SubVecTy, SubVec);
  auto PromotedIS =
      MIRBuilder.buildInsertSubvector(CastTy, CastBigVec, CastSubVec, Idx);
  MIRBuilder.buildBitcast(Dst, PromotedIS);

  ES->eraseFromParent();
  return Legalized;
}
4045 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
  // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
  Register DstReg = LoadMI.getDstReg();
  Register PtrReg = LoadMI.getPointerReg();
  LLT DstTy = MRI.getType(DstReg);
  MachineMemOperand &MMO = LoadMI.getMMO();
  LLT MemTy = MMO.getMemoryType();
  MachineFunction &MF = MIRBuilder.getMF();

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Case 1: the memory type is not an integral number of bytes (e.g. s20).
  if (MemSizeInBits != MemStoreSizeInBits) {
    if (MemTy.isVector())
      return UnableToLegalize;

    // Promote to a byte-sized load if not loading an integral number of
    // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
    LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);

    Register LoadReg = DstReg;
    LLT LoadTy = DstTy;

    // If this wasn't already an extending load, we need to widen the result
    // register to avoid creating a load with a narrower result than the source.
    if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
      LoadTy = WideMemTy;
      LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
    }

    if (isa<GSExtLoad>(LoadMI)) {
      // Sign-extending load: re-sign-extend from the original narrower width.
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
    } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
      auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
      // The extra bits are guaranteed to be zero, since we stored them that
      // way.  A zext load from Wide thus automatically gives zext from MemVT.
      MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
    } else {
      MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
    }

    // Truncate back down if we had to widen the load result above.
    if (DstTy != LoadTy)
      MIRBuilder.buildTrunc(DstReg, LoadReg);

    LoadMI.eraseFromParent();
    return Legalized;
  }

  // Big endian lowering not implemented.
  if (MIRBuilder.getDataLayout().isBigEndian())
    return UnableToLegalize;

  // This load needs splitting into power of 2 sized loads.
  //
  // Our strategy here is to generate anyextending loads for the smaller
  // types up to next power-2 result type, and then combine the two larger
  // result values together, before truncating back down to the non-pow-2
  // type.
  // E.g. v1 = i24 load =>
  // v2 = i32 zextload (2 byte)
  // v3 = i32 load (1 byte)
  // v4 = i32 shl v3, 16
  // v5 = i32 or v4, v2
  // v1 = i24 trunc v5
  // By doing this we generate the correct truncate which should get
  // combined away as an artifact with a matching extend.

  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // This load needs splitting into power of 2 sized loads.
    LargeSplitSize = llvm::bit_floor(MemSizeInBits);
    SmallSplitSize = MemSizeInBits - LargeSplitSize;
  } else {
    // This is already a power of 2, but we still need to split this in half.
    //
    // Assume we're being asked to decompose an unaligned load.
    // TODO: If this requires multiple splits, handle them all at once.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize;

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  if (MemTy.isVector()) {
    // TODO: Handle vector extloads
    if (MemTy != DstTy)
      return UnableToLegalize;

    Align Alignment = LoadMI.getAlign();
    // Given an alignment larger than the size of the memory, we can increase
    // the size of the load without needing to scalarize it.
    if (Alignment.value() * 8 > MemSizeInBits &&
        isPowerOf2_64(DstTy.getScalarSizeInBits())) {
      LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
                                     DstTy.getElementType());
      MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
      auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
      MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
                                                   NewLoad.getReg(0));
      LoadMI.eraseFromParent();
      return Legalized;
    }

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
  }

  // Scalar non-pow-2 / unaligned case: emit the two loads described above.
  MachineMemOperand *LargeMMO =
      MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
      MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);

  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
  LLT AnyExtTy = LLT::scalar(AnyExtSize);
  // The low part is zero-extended so the OR below doesn't clobber its high
  // bits.
  auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
                                             PtrReg, *LargeMMO);

  auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
                                            LargeSplitSize / 8);
  Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
  auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
  auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
                                             SmallPtr, *SmallMMO);

  // Shift the high part into position and combine with the low part.
  auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
  auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);

  if (AnyExtTy == DstTy)
    MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
  else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
    MIRBuilder.buildTrunc(DstReg, {Or});
  } else {
    assert(DstTy.isPointer() && "expected pointer");
    auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);

    // FIXME: We currently consider this to be illegal for non-integral address
    // spaces, but we still need a way to reinterpret the bits.
    MIRBuilder.buildIntToPtr(DstReg, Or);
  }

  LoadMI.eraseFromParent();
  return Legalized;
}
4197 
LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
  // Lower a non-power of 2 store into multiple pow-2 stores.
  // E.g. split an i24 store into an i16 store + i8 store.
  // We do this by first extending the stored value to the next largest power
  // of 2 type, and then using truncating stores to store the components.
  // By doing this, likewise with G_LOAD, generate an extend that can be
  // artifact-combined away instead of leaving behind extracts.
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineFunction &MF = MIRBuilder.getMF();
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();

  unsigned StoreWidth = MemTy.getSizeInBits();
  unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();

  // Case 1: scalar memory type that is not an integral number of bytes.
  if (StoreWidth != StoreSizeInBits && !SrcTy.isVector()) {
    // Promote to a byte-sized store with upper bits zero if not
    // storing an integral number of bytes.  For example, promote
    // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
    LLT WideTy = LLT::scalar(StoreSizeInBits);

    if (StoreSizeInBits > SrcTy.getSizeInBits()) {
      // Avoid creating a store with a narrower source than result.
      SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
      SrcTy = WideTy;
    }

    // Zero the bits beyond the original store width.
    auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);

    MachineMemOperand *NewMMO =
        MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
    MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  if (MemTy.isVector()) {
    // Truncating vector store (memory type differs from the value type):
    // pack the bits into a scalar instead.
    if (MemTy != SrcTy)
      return scalarizeVectorBooleanStore(StoreMI);

    // TODO: We can do better than scalarizing the vector and at least split it
    // in half.
    return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
  }

  unsigned MemSizeInBits = MemTy.getSizeInBits();
  uint64_t LargeSplitSize, SmallSplitSize;

  if (!isPowerOf2_32(MemSizeInBits)) {
    // Non-pow-2: split off the largest pow-2 chunk plus the remainder.
    LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
    SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
  } else {
    // Already a power of 2: assume we're asked to split an unaligned store.
    auto &Ctx = MF.getFunction().getContext();
    if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
      return UnableToLegalize; // Don't know what we're being asked to do.

    SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
  }

  // Extend to the next pow-2. If this store was itself the result of lowering,
  // e.g. an s56 store being broken into s32 + s24, we might have a stored type
  // that's wider than the stored size.
  unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
  const LLT NewSrcTy = LLT::scalar(AnyExtSize);

  if (SrcTy.isPointer()) {
    // Work on the integer representation of pointer values.
    const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
    SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
  }

  auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);

  // Obtain the smaller value by shifting away the larger value.
  auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
  auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);

  // Generate the PtrAdd and truncating stores.
  LLT PtrTy = MRI.getType(PtrReg);
  auto OffsetCst = MIRBuilder.buildConstant(
    LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
  auto SmallPtr =
    MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);

  MachineMemOperand *LargeMMO =
    MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
  MachineMemOperand *SmallMMO =
    MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
  MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
  MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
  StoreMI.eraseFromParent();
  return Legalized;
}
4292 
// Lower a truncating vector store whose element type is not byte-sized (e.g.
// storing <N x s1>) by packing the truncated elements into a single integer
// and emitting one scalar store of that integer.
LegalizerHelper::LegalizeResult
LegalizerHelper::scalarizeVectorBooleanStore(GStore &StoreMI) {
  Register SrcReg = StoreMI.getValueReg();
  Register PtrReg = StoreMI.getPointerReg();
  LLT SrcTy = MRI.getType(SrcReg);
  MachineMemOperand &MMO = **StoreMI.memoperands_begin();
  LLT MemTy = MMO.getMemoryType();
  LLT MemScalarTy = MemTy.getElementType();
  MachineFunction &MF = MIRBuilder.getMF();

  assert(SrcTy.isVector() && "Expect a vector store type");

  if (!MemScalarTy.isByteSized()) {
    // We need to build an integer scalar of the vector bit pattern.
    // It's not legal for us to add padding when storing a vector.
    unsigned NumBits = MemTy.getSizeInBits();
    LLT IntTy = LLT::scalar(NumBits);
    auto CurrVal = MIRBuilder.buildConstant(IntTy, 0);
    LLT IdxTy = TLI.getVectorIdxLLT(MF.getDataLayout());

    // Extract each element, truncate it to the memory scalar type, and OR it
    // into its bit position in the accumulated integer.
    for (unsigned I = 0, E = MemTy.getNumElements(); I < E; ++I) {
      auto Elt = MIRBuilder.buildExtractVectorElement(
          SrcTy.getElementType(), SrcReg, MIRBuilder.buildConstant(IdxTy, I));
      auto Trunc = MIRBuilder.buildTrunc(MemScalarTy, Elt);
      auto ZExt = MIRBuilder.buildZExt(IntTy, Trunc);
      // Big-endian targets place element 0 in the highest bits.
      unsigned ShiftIntoIdx = MF.getDataLayout().isBigEndian()
                                  ? (MemTy.getNumElements() - 1) - I
                                  : I;
      auto ShiftAmt = MIRBuilder.buildConstant(
          IntTy, ShiftIntoIdx * MemScalarTy.getSizeInBits());
      auto Shifted = MIRBuilder.buildShl(IntTy, ZExt, ShiftAmt);
      CurrVal = MIRBuilder.buildOr(IntTy, CurrVal, Shifted);
    }
    // Store the assembled integer with a scalar memory operand of the same
    // total size.
    auto PtrInfo = MMO.getPointerInfo();
    auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, IntTy);
    MIRBuilder.buildStore(CurrVal, PtrReg, *NewMMO);
    StoreMI.eraseFromParent();
    return Legalized;
  }

  // TODO: implement simple scalarization.
  return UnableToLegalize;
}
4336 
// Legalize MI by bitcasting its operands (and result) to CastTy, dispatching
// to an opcode-specific helper where the rewrite is not a simple in-place
// operand cast. Returns UnableToLegalize for unhandled opcodes/type indexes.
LegalizerHelper::LegalizeResult
LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LOAD: {
    if (TypeIdx != 0)
      return UnableToLegalize;
    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of an extending load.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    // Rewrite the result register and the memory operand type in place.
    Observer.changingInstr(MI);
    bitcastDst(MI, CastTy, 0);
    MMO.setType(CastTy);
    // The range metadata is no longer valid when reinterpreted as a different
    // type.
    MMO.clearRanges();
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_STORE: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    MachineMemOperand &MMO = **MI.memoperands_begin();

    // Not sure how to interpret a bitcast of a truncating store.
    if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
      return UnableToLegalize;

    // Rewrite the stored value register and the memory operand type in place.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 0);
    MMO.setType(CastTy);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_SELECT: {
    if (TypeIdx != 0)
      return UnableToLegalize;

    if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
      LLVM_DEBUG(
          dbgs() << "bitcast action not implemented for vector select\n");
      return UnableToLegalize;
    }

    // Cast both select values and the result; the condition is untouched.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 2);
    bitcastSrc(MI, CastTy, 3);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    // Bitwise ops are insensitive to the value's interpretation, so simply
    // cast both sources and the destination.
    Observer.changingInstr(MI);
    bitcastSrc(MI, CastTy, 1);
    bitcastSrc(MI, CastTy, 2);
    bitcastDst(MI, CastTy, 0);
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
  case TargetOpcode::G_CONCAT_VECTORS:
    return bitcastConcatVector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return bitcastShuffleVector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_EXTRACT_SUBVECTOR:
    return bitcastExtractSubvector(MI, TypeIdx, CastTy);
  case TargetOpcode::G_INSERT_SUBVECTOR:
    return bitcastInsertSubvector(MI, TypeIdx, CastTy);
  default:
    return UnableToLegalize;
  }
}
4417 
4418 // Legalize an instruction by changing the opcode in place.
4419 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
4420     Observer.changingInstr(MI);
4421     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
4422     Observer.changedInstr(MI);
4423 }
4424 
4425 LegalizerHelper::LegalizeResult
4426 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
4427   using namespace TargetOpcode;
4428 
4429   switch(MI.getOpcode()) {
4430   default:
4431     return UnableToLegalize;
4432   case TargetOpcode::G_FCONSTANT:
4433     return lowerFConstant(MI);
4434   case TargetOpcode::G_BITCAST:
4435     return lowerBitcast(MI);
4436   case TargetOpcode::G_SREM:
4437   case TargetOpcode::G_UREM: {
4438     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4439     auto Quot =
4440         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
4441                               {MI.getOperand(1), MI.getOperand(2)});
4442 
4443     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
4444     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
4445     MI.eraseFromParent();
4446     return Legalized;
4447   }
4448   case TargetOpcode::G_SADDO:
4449   case TargetOpcode::G_SSUBO:
4450     return lowerSADDO_SSUBO(MI);
4451   case TargetOpcode::G_UMULH:
4452   case TargetOpcode::G_SMULH:
4453     return lowerSMULH_UMULH(MI);
4454   case TargetOpcode::G_SMULO:
4455   case TargetOpcode::G_UMULO: {
4456     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
4457     // result.
4458     auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
4459     LLT Ty = MRI.getType(Res);
4460 
4461     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
4462                           ? TargetOpcode::G_SMULH
4463                           : TargetOpcode::G_UMULH;
4464 
4465     Observer.changingInstr(MI);
4466     const auto &TII = MIRBuilder.getTII();
4467     MI.setDesc(TII.get(TargetOpcode::G_MUL));
4468     MI.removeOperand(1);
4469     Observer.changedInstr(MI);
4470 
4471     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
4472     auto Zero = MIRBuilder.buildConstant(Ty, 0);
4473 
4474     // Move insert point forward so we can use the Res register if needed.
4475     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
4476 
4477     // For *signed* multiply, overflow is detected by checking:
4478     // (hi != (lo >> bitwidth-1))
4479     if (Opcode == TargetOpcode::G_SMULH) {
4480       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
4481       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
4482       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
4483     } else {
4484       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
4485     }
4486     return Legalized;
4487   }
4488   case TargetOpcode::G_FNEG: {
4489     auto [Res, SubByReg] = MI.getFirst2Regs();
4490     LLT Ty = MRI.getType(Res);
4491 
4492     auto SignMask = MIRBuilder.buildConstant(
4493         Ty, APInt::getSignMask(Ty.getScalarSizeInBits()));
4494     MIRBuilder.buildXor(Res, SubByReg, SignMask);
4495     MI.eraseFromParent();
4496     return Legalized;
4497   }
4498   case TargetOpcode::G_FSUB:
4499   case TargetOpcode::G_STRICT_FSUB: {
4500     auto [Res, LHS, RHS] = MI.getFirst3Regs();
4501     LLT Ty = MRI.getType(Res);
4502 
4503     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
4504     auto Neg = MIRBuilder.buildFNeg(Ty, RHS);
4505 
4506     if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
4507       MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
4508     else
4509       MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
4510 
4511     MI.eraseFromParent();
4512     return Legalized;
4513   }
4514   case TargetOpcode::G_FMAD:
4515     return lowerFMad(MI);
4516   case TargetOpcode::G_FFLOOR:
4517     return lowerFFloor(MI);
4518   case TargetOpcode::G_LROUND:
4519   case TargetOpcode::G_LLROUND: {
4520     Register DstReg = MI.getOperand(0).getReg();
4521     Register SrcReg = MI.getOperand(1).getReg();
4522     LLT SrcTy = MRI.getType(SrcReg);
4523     auto Round = MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_ROUND, {SrcTy},
4524                                        {SrcReg});
4525     MIRBuilder.buildFPTOSI(DstReg, Round);
4526     MI.eraseFromParent();
4527     return Legalized;
4528   }
4529   case TargetOpcode::G_INTRINSIC_ROUND:
4530     return lowerIntrinsicRound(MI);
4531   case TargetOpcode::G_FRINT: {
4532     // Since round even is the assumed rounding mode for unconstrained FP
4533     // operations, rint and roundeven are the same operation.
4534     changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
4535     return Legalized;
4536   }
4537   case TargetOpcode::G_INTRINSIC_LRINT:
4538   case TargetOpcode::G_INTRINSIC_LLRINT: {
4539     Register DstReg = MI.getOperand(0).getReg();
4540     Register SrcReg = MI.getOperand(1).getReg();
4541     LLT SrcTy = MRI.getType(SrcReg);
4542     auto Round =
4543         MIRBuilder.buildInstr(TargetOpcode::G_FRINT, {SrcTy}, {SrcReg});
4544     MIRBuilder.buildFPTOSI(DstReg, Round);
4545     MI.eraseFromParent();
4546     return Legalized;
4547   }
4548   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
4549     auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
4550     Register NewOldValRes = MRI.cloneVirtualRegister(OldValRes);
4551     MIRBuilder.buildAtomicCmpXchg(NewOldValRes, Addr, CmpVal, NewVal,
4552                                   **MI.memoperands_begin());
4553     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, NewOldValRes, CmpVal);
4554     MIRBuilder.buildCopy(OldValRes, NewOldValRes);
4555     MI.eraseFromParent();
4556     return Legalized;
4557   }
4558   case TargetOpcode::G_LOAD:
4559   case TargetOpcode::G_SEXTLOAD:
4560   case TargetOpcode::G_ZEXTLOAD:
4561     return lowerLoad(cast<GAnyLoad>(MI));
4562   case TargetOpcode::G_STORE:
4563     return lowerStore(cast<GStore>(MI));
4564   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
4565   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
4566   case TargetOpcode::G_CTLZ:
4567   case TargetOpcode::G_CTTZ:
4568   case TargetOpcode::G_CTPOP:
4569     return lowerBitCount(MI);
4570   case G_UADDO: {
4571     auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
4572 
4573     Register NewRes = MRI.cloneVirtualRegister(Res);
4574 
4575     MIRBuilder.buildAdd(NewRes, LHS, RHS);
4576     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, NewRes, RHS);
4577 
4578     MIRBuilder.buildCopy(Res, NewRes);
4579 
4580     MI.eraseFromParent();
4581     return Legalized;
4582   }
4583   case G_UADDE: {
4584     auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
4585     const LLT CondTy = MRI.getType(CarryOut);
4586     const LLT Ty = MRI.getType(Res);
4587 
4588     Register NewRes = MRI.cloneVirtualRegister(Res);
4589 
4590     // Initial add of the two operands.
4591     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
4592 
4593     // Initial check for carry.
4594     auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
4595 
4596     // Add the sum and the carry.
4597     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
4598     MIRBuilder.buildAdd(NewRes, TmpRes, ZExtCarryIn);
4599 
4600     // Second check for carry. We can only carry if the initial sum is all 1s
4601     // and the carry is set, resulting in a new sum of 0.
4602     auto Zero = MIRBuilder.buildConstant(Ty, 0);
4603     auto ResEqZero =
4604         MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, NewRes, Zero);
4605     auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
4606     MIRBuilder.buildOr(CarryOut, Carry, Carry2);
4607 
4608     MIRBuilder.buildCopy(Res, NewRes);
4609 
4610     MI.eraseFromParent();
4611     return Legalized;
4612   }
4613   case G_USUBO: {
4614     auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
4615 
4616     MIRBuilder.buildSub(Res, LHS, RHS);
4617     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
4618 
4619     MI.eraseFromParent();
4620     return Legalized;
4621   }
4622   case G_USUBE: {
4623     auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
4624     const LLT CondTy = MRI.getType(BorrowOut);
4625     const LLT Ty = MRI.getType(Res);
4626 
4627     // Initial subtract of the two operands.
4628     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
4629 
4630     // Initial check for borrow.
4631     auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);
4632 
4633     // Subtract the borrow from the first subtract.
4634     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
4635     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
4636 
4637     // Second check for borrow. We can only borrow if the initial difference is
4638     // 0 and the borrow is set, resulting in a new difference of all 1s.
4639     auto Zero = MIRBuilder.buildConstant(Ty, 0);
4640     auto TmpResEqZero =
4641         MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
4642     auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
4643     MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);
4644 
4645     MI.eraseFromParent();
4646     return Legalized;
4647   }
4648   case G_UITOFP:
4649     return lowerUITOFP(MI);
4650   case G_SITOFP:
4651     return lowerSITOFP(MI);
4652   case G_FPTOUI:
4653     return lowerFPTOUI(MI);
4654   case G_FPTOSI:
4655     return lowerFPTOSI(MI);
4656   case G_FPTOUI_SAT:
4657   case G_FPTOSI_SAT:
4658     return lowerFPTOINT_SAT(MI);
4659   case G_FPTRUNC:
4660     return lowerFPTRUNC(MI);
4661   case G_FPOWI:
4662     return lowerFPOWI(MI);
4663   case G_SMIN:
4664   case G_SMAX:
4665   case G_UMIN:
4666   case G_UMAX:
4667     return lowerMinMax(MI);
4668   case G_SCMP:
4669   case G_UCMP:
4670     return lowerThreewayCompare(MI);
4671   case G_FCOPYSIGN:
4672     return lowerFCopySign(MI);
4673   case G_FMINNUM:
4674   case G_FMAXNUM:
4675   case G_FMINIMUMNUM:
4676   case G_FMAXIMUMNUM:
4677     return lowerFMinNumMaxNum(MI);
4678   case G_MERGE_VALUES:
4679     return lowerMergeValues(MI);
4680   case G_UNMERGE_VALUES:
4681     return lowerUnmergeValues(MI);
4682   case TargetOpcode::G_SEXT_INREG: {
4683     assert(MI.getOperand(2).isImm() && "Expected immediate");
4684     int64_t SizeInBits = MI.getOperand(2).getImm();
4685 
4686     auto [DstReg, SrcReg] = MI.getFirst2Regs();
4687     LLT DstTy = MRI.getType(DstReg);
4688     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
4689 
4690     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
4691     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
4692     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
4693     MI.eraseFromParent();
4694     return Legalized;
4695   }
4696   case G_EXTRACT_VECTOR_ELT:
4697   case G_INSERT_VECTOR_ELT:
4698     return lowerExtractInsertVectorElt(MI);
4699   case G_SHUFFLE_VECTOR:
4700     return lowerShuffleVector(MI);
4701   case G_VECTOR_COMPRESS:
4702     return lowerVECTOR_COMPRESS(MI);
4703   case G_DYN_STACKALLOC:
4704     return lowerDynStackAlloc(MI);
4705   case G_STACKSAVE:
4706     return lowerStackSave(MI);
4707   case G_STACKRESTORE:
4708     return lowerStackRestore(MI);
4709   case G_EXTRACT:
4710     return lowerExtract(MI);
4711   case G_INSERT:
4712     return lowerInsert(MI);
4713   case G_BSWAP:
4714     return lowerBswap(MI);
4715   case G_BITREVERSE:
4716     return lowerBitreverse(MI);
4717   case G_READ_REGISTER:
4718   case G_WRITE_REGISTER:
4719     return lowerReadWriteRegister(MI);
4720   case G_UADDSAT:
4721   case G_USUBSAT: {
4722     // Try to make a reasonable guess about which lowering strategy to use. The
4723     // target can override this with custom lowering and calling the
4724     // implementation functions.
4725     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4726     if (LI.isLegalOrCustom({G_UMIN, Ty}))
4727       return lowerAddSubSatToMinMax(MI);
4728     return lowerAddSubSatToAddoSubo(MI);
4729   }
4730   case G_SADDSAT:
4731   case G_SSUBSAT: {
4732     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4733 
4734     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
4735     // since it's a shorter expansion. However, we would need to figure out the
4736     // preferred boolean type for the carry out for the query.
4737     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
4738       return lowerAddSubSatToMinMax(MI);
4739     return lowerAddSubSatToAddoSubo(MI);
4740   }
4741   case G_SSHLSAT:
4742   case G_USHLSAT:
4743     return lowerShlSat(MI);
4744   case G_ABS:
4745     return lowerAbsToAddXor(MI);
4746   case G_FABS:
4747     return lowerFAbs(MI);
4748   case G_SELECT:
4749     return lowerSelect(MI);
4750   case G_IS_FPCLASS:
4751     return lowerISFPCLASS(MI);
4752   case G_SDIVREM:
4753   case G_UDIVREM:
4754     return lowerDIVREM(MI);
4755   case G_FSHL:
4756   case G_FSHR:
4757     return lowerFunnelShift(MI);
4758   case G_ROTL:
4759   case G_ROTR:
4760     return lowerRotate(MI);
4761   case G_MEMSET:
4762   case G_MEMCPY:
4763   case G_MEMMOVE:
4764     return lowerMemCpyFamily(MI);
4765   case G_MEMCPY_INLINE:
4766     return lowerMemcpyInline(MI);
4767   case G_ZEXT:
4768   case G_SEXT:
4769   case G_ANYEXT:
4770     return lowerEXT(MI);
4771   case G_TRUNC:
4772     return lowerTRUNC(MI);
4773   GISEL_VECREDUCE_CASES_NONSEQ
4774     return lowerVectorReduction(MI);
4775   case G_VAARG:
4776     return lowerVAArg(MI);
4777   }
4778 }
4779 
4780 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
4781                                                   Align MinAlign) const {
4782   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
4783   // datalayout for the preferred alignment. Also there should be a target hook
4784   // for this to allow targets to reduce the alignment and ignore the
4785   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
4786   // the type.
4787   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
4788 }
4789 
4790 MachineInstrBuilder
4791 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
4792                                       MachinePointerInfo &PtrInfo) {
4793   MachineFunction &MF = MIRBuilder.getMF();
4794   const DataLayout &DL = MIRBuilder.getDataLayout();
4795   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
4796 
4797   unsigned AddrSpace = DL.getAllocaAddrSpace();
4798   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
4799 
4800   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
4801   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
4802 }
4803 
4804 MachineInstrBuilder LegalizerHelper::createStackStoreLoad(const DstOp &Res,
4805                                                           const SrcOp &Val) {
4806   LLT SrcTy = Val.getLLTTy(MRI);
4807   Align StackTypeAlign =
4808       std::max(getStackTemporaryAlignment(SrcTy),
4809                getStackTemporaryAlignment(Res.getLLTTy(MRI)));
4810   MachinePointerInfo PtrInfo;
4811   auto StackTemp =
4812       createStackTemporary(SrcTy.getSizeInBytes(), StackTypeAlign, PtrInfo);
4813 
4814   MIRBuilder.buildStore(Val, StackTemp, PtrInfo, StackTypeAlign);
4815   return MIRBuilder.buildLoad(Res, StackTemp, PtrInfo, StackTypeAlign);
4816 }
4817 
4818 static Register clampVectorIndex(MachineIRBuilder &B, Register IdxReg,
4819                                  LLT VecTy) {
4820   LLT IdxTy = B.getMRI()->getType(IdxReg);
4821   unsigned NElts = VecTy.getNumElements();
4822 
4823   int64_t IdxVal;
4824   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal))) {
4825     if (IdxVal < VecTy.getNumElements())
4826       return IdxReg;
4827     // If a constant index would be out of bounds, clamp it as well.
4828   }
4829 
4830   if (isPowerOf2_32(NElts)) {
4831     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
4832     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
4833   }
4834 
4835   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
4836       .getReg(0);
4837 }
4838 
4839 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
4840                                                   Register Index) {
4841   LLT EltTy = VecTy.getElementType();
4842 
4843   // Calculate the element offset and add it to the pointer.
4844   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
4845   assert(EltSize * 8 == EltTy.getSizeInBits() &&
4846          "Converting bits to bytes lost precision");
4847 
4848   Index = clampVectorIndex(MIRBuilder, Index, VecTy);
4849 
4850   // Convert index to the correct size for the address space.
4851   const DataLayout &DL = MIRBuilder.getDataLayout();
4852   unsigned AS = MRI.getType(VecPtr).getAddressSpace();
4853   unsigned IndexSizeInBits = DL.getIndexSize(AS) * 8;
4854   LLT IdxTy = MRI.getType(Index).changeElementSize(IndexSizeInBits);
4855   if (IdxTy != MRI.getType(Index))
4856     Index = MIRBuilder.buildSExtOrTrunc(IdxTy, Index).getReg(0);
4857 
4858   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
4859                                  MIRBuilder.buildConstant(IdxTy, EltSize));
4860 
4861   LLT PtrTy = MRI.getType(VecPtr);
4862   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
4863 }
4864 
4865 #ifndef NDEBUG
4866 /// Check that all vector operands have same number of elements. Other operands
4867 /// should be listed in NonVecOp.
4868 static bool hasSameNumEltsOnAllVectorOperands(
4869     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
4870     std::initializer_list<unsigned> NonVecOpIndices) {
4871   if (MI.getNumMemOperands() != 0)
4872     return false;
4873 
4874   LLT VecTy = MRI.getType(MI.getReg(0));
4875   if (!VecTy.isVector())
4876     return false;
4877   unsigned NumElts = VecTy.getNumElements();
4878 
4879   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
4880     MachineOperand &Op = MI.getOperand(OpIdx);
4881     if (!Op.isReg()) {
4882       if (!is_contained(NonVecOpIndices, OpIdx))
4883         return false;
4884       continue;
4885     }
4886 
4887     LLT Ty = MRI.getType(Op.getReg());
4888     if (!Ty.isVector()) {
4889       if (!is_contained(NonVecOpIndices, OpIdx))
4890         return false;
4891       continue;
4892     }
4893 
4894     if (Ty.getNumElements() != NumElts)
4895       return false;
4896   }
4897 
4898   return true;
4899 }
4900 #endif
4901 
4902 /// Fill \p DstOps with DstOps that have same number of elements combined as
4903 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
4904 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
4905 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
4906 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4907                        unsigned NumElts) {
4908   LLT LeftoverTy;
4909   assert(Ty.isVector() && "Expected vector type");
4910   LLT EltTy = Ty.getElementType();
4911   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
4912   int NumParts, NumLeftover;
4913   std::tie(NumParts, NumLeftover) =
4914       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
4915 
4916   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4917   for (int i = 0; i < NumParts; ++i) {
4918     DstOps.push_back(NarrowTy);
4919   }
4920 
4921   if (LeftoverTy.isValid()) {
4922     assert(NumLeftover == 1 && "expected exactly one leftover");
4923     DstOps.push_back(LeftoverTy);
4924   }
4925 }
4926 
4927 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
4928 /// made from \p Op depending on operand type.
4929 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4930                            MachineOperand &Op) {
4931   for (unsigned i = 0; i < N; ++i) {
4932     if (Op.isReg())
4933       Ops.push_back(Op.getReg());
4934     else if (Op.isImm())
4935       Ops.push_back(Op.getImm());
4936     else if (Op.isPredicate())
4937       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
4938     else
4939       llvm_unreachable("Unsupported type");
4940   }
4941 }
4942 
// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different element
// type.
//
// e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
//       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//
// Also handles some irregular breakdown cases, e.g.
// e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
//       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
//             s64 = G_SHL s64, s32
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMultiEltType(
    GenericMachineInstr &MI, unsigned NumElts,
    std::initializer_list<unsigned> NonVecOpIndices) {
  assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
         "Non-compatible opcode or not specified non-vector operands");
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
  // Build instructions with DstOps to use instruction found by CSE directly.
  // CSE copies found instruction into given vreg when building with vreg dest.
  SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
  // Output registers will be taken from created instructions.
  SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
  for (unsigned i = 0; i < NumDefs; ++i) {
    makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
  }

  // Split vector input operands into sub-vectors with NumElts elts + Leftover.
  // Operands listed in NonVecOpIndices will be used as is without splitting;
  // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
  // scalar condition (op 1), immediate in sext_inreg (op 2).
  SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       ++UseIdx, ++UseNo) {
    if (is_contained(NonVecOpIndices, UseIdx)) {
      // Non-vector operand: duplicate it once per output piece (including any
      // leftover piece, hence OutputOpsPieces[0].size() rather than a count
      // derived from NumElts).
      broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
                     MI.getOperand(UseIdx));
    } else {
      // Vector operand: split it into NumElts-sized pieces plus leftover.
      SmallVector<Register, 8> SplitPieces;
      extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder,
                         MRI);
      llvm::append_range(InputOpsPieces[UseNo], SplitPieces);
    }
  }

  // One extra (smaller) piece is needed when NumElts does not evenly divide
  // the original element count.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;

  // Take i-th piece of each input operand split and build sub-vector/scalar
  // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    SmallVector<DstOp, 2> Defs;
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      Defs.push_back(OutputOpsPieces[DstNo][i]);

    SmallVector<SrcOp, 3> Uses;
    for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
      Uses.push_back(InputOpsPieces[InputNo][i]);

    // Rebuild the original opcode on the narrow pieces, preserving MI flags.
    auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
    for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
      OutputRegs[DstNo].push_back(I.getReg(DstNo));
  }

  // Merge small outputs into MI's output for each def operand.
  if (NumLeftovers) {
    for (unsigned i = 0; i < NumDefs; ++i)
      mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
  } else {
    for (unsigned i = 0; i < NumDefs; ++i)
      MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
  }

  MI.eraseFromParent();
  return Legalized;
}
5024 
// Split a vector G_PHI into several PHIs with NumElts elements each (plus a
// smaller leftover PHI when NumElts does not divide the element count), then
// merge the narrow PHI results back into the original def.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
                                        unsigned NumElts) {
  unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();

  unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
  unsigned NumDefs = MI.getNumDefs();

  SmallVector<DstOp, 8> OutputOpsPieces;
  SmallVector<Register, 8> OutputRegs;
  makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);

  // Instructions that perform register split will be inserted in basic block
  // where register is defined (basic block is in the next operand).
  // PHI uses come in (value, MBB) pairs, hence NumInputs / 2 and UseIdx += 2.
  SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
  for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
       UseIdx += 2, ++UseNo) {
    MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
    // Emit the splits in the predecessor block, before its terminator.
    MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
    extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
                       MIRBuilder, MRI);
  }

  // Build PHIs with fewer elements.
  unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
  // The narrow PHIs themselves go where the original PHI was.
  MIRBuilder.setInsertPt(*MI.getParent(), MI);
  for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
    auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
    Phi.addDef(
        MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
    OutputRegs.push_back(Phi.getReg(0));

    // Add the i-th piece from each predecessor, reusing the original
    // predecessor MBB operands.
    for (unsigned j = 0; j < NumInputs / 2; ++j) {
      Phi.addUse(InputOpsPieces[j][i]);
      Phi.add(MI.getOperand(1 + j * 2 + 1));
    }
  }

  // Set the insert point after the existing PHIs
  MachineBasicBlock &MBB = *MI.getParent();
  MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());

  // Merge small outputs into MI's def.
  if (NumLeftovers) {
    mergeMixedSubvectors(MI.getReg(0), OutputRegs);
  } else {
    MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
  }

  MI.eraseFromParent();
  return Legalized;
}
5077 
// Narrow a G_UNMERGE_VALUES on type index 1 (the source) by first unmerging
// the wide source into NarrowTy pieces, then unmerging each piece into the
// original destination registers.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
                                                  unsigned TypeIdx,
                                                  LLT NarrowTy) {
  // Last operand of G_UNMERGE_VALUES is the single source; all others are defs.
  const int NumDst = MI.getNumOperands() - 1;
  const Register SrcReg = MI.getOperand(NumDst).getReg();
  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
  LLT SrcTy = MRI.getType(SrcReg);

  if (TypeIdx != 1 || NarrowTy == DstTy)
    return UnableToLegalize;

  // Requires compatible types. Otherwise SrcReg should have been defined by
  // merge-like instruction that would get artifact combined. Most likely
  // instruction that defines SrcReg has to perform more/fewer elements
  // legalization compatible with NarrowTy.
  assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");

  // NarrowTy must evenly divide SrcTy, and DstTy must evenly divide NarrowTy,
  // so both unmerge layers produce whole pieces.
  if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
      (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely DstTy (smaller then register size) packed in SrcTy
  // (larger then register size) and since unmerge was not combined it will be
  // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
  // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.

  // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
  //
  // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
  // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
  // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
  auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
  const int NumUnmerge = Unmerge->getNumOperands() - 1;
  const int PartsPerUnmerge = NumDst / NumUnmerge;

  // Second layer: unmerge each NarrowTy piece directly into the original
  // destination registers, so no final copies are needed.
  for (int I = 0; I != NumUnmerge; ++I) {
    auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);

    for (int J = 0; J != PartsPerUnmerge; ++J)
      MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
    MIB.addUse(Unmerge.getReg(I));
  }

  MI.eraseFromParent();
  return Legalized;
}
5126 
// Narrow a merge-like instruction (G_MERGE_VALUES / G_BUILD_VECTOR /
// G_CONCAT_VECTORS). TypeIdx == 1 narrows the source operands via
// unmerge-to-elements then rebuild; TypeIdx == 0 groups the sources into
// NarrowTy-sized intermediate merges first.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
                                          LLT NarrowTy) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  // Requires compatible types. Otherwise user of DstReg did not perform unmerge
  // that should have been artifact combined. Most likely instruction that uses
  // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
  assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
  assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
  if (NarrowTy == SrcTy)
    return UnableToLegalize;

  // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
  // is for old mir tests. Since the changes to more/fewer elements it should no
  // longer be possible to generate MIR like this when starting from llvm-ir
  // because LCMTy approach was replaced with merge/unmerge to vector elements.
  if (TypeIdx == 1) {
    assert(SrcTy.isVector() && "Expected vector types");
    assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
    // NarrowTy must evenly divide DstTy and be strictly smaller than SrcTy.
    if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
        (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
      return UnableToLegalize;
    // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
    //
    // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
    // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
    // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
    // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
    // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
    // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11

    // Flatten every source operand into a list of scalar elements.
    SmallVector<Register, 8> Elts;
    LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
    for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
      auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
      for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
        Elts.push_back(Unmerge.getReg(j));
    }

    // Regroup the scalar elements into NarrowTy-sized vectors.
    SmallVector<Register, 8> NarrowTyElts;
    unsigned NumNarrowTyElts = NarrowTy.getNumElements();
    unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
    for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
         ++i, Offset += NumNarrowTyElts) {
      ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
      NarrowTyElts.push_back(
          MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
    }

    MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
    MI.eraseFromParent();
    return Legalized;
  }

  assert(TypeIdx == 0 && "Bad type index");
  // SrcTy must evenly divide NarrowTy, and NarrowTy must evenly divide DstTy.
  if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
      (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
    return UnableToLegalize;

  // This is most likely SrcTy (smaller then register size) packed in DstTy
  // (larger then register size) and since merge was not combined it will be
  // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
  // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.

  // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
  //
  // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
  // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
  // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
  SmallVector<Register, 8> NarrowTyElts;
  unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
  unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
  // Number of source operands consumed per NarrowTy piece.
  unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
  for (unsigned i = 0; i < NumParts; ++i) {
    SmallVector<Register, 8> Sources;
    for (unsigned j = 0; j < NumElts; ++j)
      Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
    NarrowTyElts.push_back(
        MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
  MI.eraseFromParent();
  return Legalized;
}
5212 
// Narrow G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT by splitting the vector
// into NarrowVecTy pieces and operating on the piece that contains the
// (constant) index. Variable indices fall back to full lowering.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
                                                           unsigned TypeIdx,
                                                           LLT NarrowVecTy) {
  auto [DstReg, SrcVec] = MI.getFirst2Regs();
  Register InsertVal;
  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;

  // For inserts the vector type is type index 0; for extracts it is index 1.
  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
  if (IsInsert)
    InsertVal = MI.getOperand(2).getReg();

  // The index is the last operand for both opcodes.
  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();

  // TODO: Handle total scalarization case.
  if (!NarrowVecTy.isVector())
    return UnableToLegalize;

  LLT VecTy = MRI.getType(SrcVec);

  // If the index is a constant, we can really break this down as you would
  // expect, and index into the target size pieces.
  int64_t IdxVal;
  auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
  if (MaybeCst) {
    IdxVal = MaybeCst->Value.getSExtValue();
    // Avoid out of bounds indexing the pieces.
    if (IdxVal >= VecTy.getNumElements()) {
      MIRBuilder.buildUndef(DstReg);
      MI.eraseFromParent();
      return Legalized;
    }

    SmallVector<Register, 8> VecParts;
    LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);

    // Build a sequence of NarrowTy pieces in VecParts for this operand.
    LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
                                    TargetOpcode::G_ANYEXT);

    unsigned NewNumElts = NarrowVecTy.getNumElements();

    // Select the piece holding the element and rebase the index within it.
    LLT IdxTy = MRI.getType(Idx);
    int64_t PartIdx = IdxVal / NewNumElts;
    auto NewIdx =
        MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);

    if (IsInsert) {
      LLT PartTy = MRI.getType(VecParts[PartIdx]);

      // Use the adjusted index to insert into one of the subvectors.
      auto InsertPart = MIRBuilder.buildInsertVectorElement(
          PartTy, VecParts[PartIdx], InsertVal, NewIdx);
      VecParts[PartIdx] = InsertPart.getReg(0);

      // Recombine the inserted subvector with the others to reform the result
      // vector.
      buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
    } else {
      MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  // With a variable index, we can't perform the operation in a smaller type, so
  // we're forced to expand this.
  //
  // TODO: We could emit a chain of compare/select to figure out which piece to
  // index.
  return lowerExtractInsertVectorElt(MI);
}
5286 
// Break a wide non-atomic load/store into a sequence of NarrowTy-sized
// accesses (plus a leftover-sized access when the breakdown is uneven),
// honoring the target's endianness when assigning byte offsets to pieces.
LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
                                      LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  // Each piece is addressed by a byte offset, so the narrow type must be a
  // whole number of bytes.
  if (!NarrowTy.isByteSized()) {
    LLVM_DEBUG(dbgs() << "Can't narrow load/store to non-byte-sized type\n");
    return UnableToLegalize;
  }

  // This implementation doesn't work for atomics. Give up instead of doing
  // something invalid.
  if (LdStMI.isAtomic())
    return UnableToLegalize;

  bool IsLoad = isa<GLoad>(LdStMI);
  Register ValReg = LdStMI.getReg(0);
  Register AddrReg = LdStMI.getPointerReg();
  LLT ValTy = MRI.getType(ValReg);

  // Only plain loads/stores where the value type matches the memory size are
  // handled; extending loads / truncating stores are rejected.
  // FIXME: Do we need a distinct NarrowMemory legalize action?
  if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize().getValue()) {
    LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
    return UnableToLegalize;
  }

  // -1 acts as a sentinel meaning "breakdown failed".
  int NumParts = -1;
  int NumLeftover = -1;
  LLT LeftoverTy;
  SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
  if (IsLoad) {
    // For a load only the piece counts are needed; registers are created while
    // emitting the narrow loads below.
    std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
  } else {
    // For a store, split the stored value into narrow + leftover registers now.
    if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
                     NarrowLeftoverRegs, MIRBuilder, MRI)) {
      NumParts = NarrowRegs.size();
      NumLeftover = NarrowLeftoverRegs.size();
    }
  }

  if (NumParts == -1)
    return UnableToLegalize;

  LLT PtrTy = MRI.getType(AddrReg);
  const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());

  unsigned TotalSize = ValTy.getSizeInBits();

  // Split the load/store into PartTy sized pieces starting at Offset. If this
  // is a load, return the new registers in ValRegs. For a store, each elements
  // of ValRegs should be PartTy. Returns the next offset that needs to be
  // handled.
  bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
  auto MMO = LdStMI.getMMO();
  auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
                             unsigned NumParts, unsigned Offset) -> unsigned {
    MachineFunction &MF = MIRBuilder.getMF();
    unsigned PartSize = PartTy.getSizeInBits();
    for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
         ++Idx) {
      unsigned ByteOffset = Offset / 8;
      Register NewAddrReg;

      // Compute the piece's address; materializePtrAdd folds a zero offset.
      MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);

      // Derive a memory operand for the piece from the original MMO.
      MachineMemOperand *NewMMO =
          MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);

      if (IsLoad) {
        Register Dst = MRI.createGenericVirtualRegister(PartTy);
        ValRegs.push_back(Dst);
        MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
      } else {
        MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
      }
      // Big-endian walks offsets downward from the high end.
      Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
    }

    return Offset;
  };

  unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
  unsigned HandledOffset =
      splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);

  // Handle the rest of the register if this isn't an even type breakdown.
  if (LeftoverTy.isValid())
    splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);

  // For loads, stitch the narrow results back together into the original def.
  if (IsLoad) {
    insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
                LeftoverTy, NarrowLeftoverRegs);
  }

  LdStMI.eraseFromParent();
  return Legalized;
}
5386 
// Split a vector operation whose vector type has too many elements into
// operations on the narrower type \p NarrowTy (or scalarize it when NarrowTy
// is scalar). This is a pure dispatcher: it selects the splitting strategy
// per opcode and delegates; opcodes with no fewer-elements expansion here
// report UnableToLegalize.
LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  using namespace TargetOpcode;
  GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
  // Elements handled per split piece; 1 means full scalarization.
  unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  switch (MI.getOpcode()) {
  // Purely element-wise operations: every operand and result is split the
  // same way, with no extra non-vector operands to preserve.
  case G_IMPLICIT_DEF:
  case G_TRUNC:
  case G_AND:
  case G_OR:
  case G_XOR:
  case G_ADD:
  case G_SUB:
  case G_MUL:
  case G_PTR_ADD:
  case G_SMULH:
  case G_UMULH:
  case G_FADD:
  case G_FMUL:
  case G_FSUB:
  case G_FNEG:
  case G_FABS:
  case G_FCANONICALIZE:
  case G_FDIV:
  case G_FREM:
  case G_FMA:
  case G_FMAD:
  case G_FPOW:
  case G_FEXP:
  case G_FEXP2:
  case G_FEXP10:
  case G_FLOG:
  case G_FLOG2:
  case G_FLOG10:
  case G_FLDEXP:
  case G_FNEARBYINT:
  case G_FCEIL:
  case G_FFLOOR:
  case G_FRINT:
  case G_INTRINSIC_LRINT:
  case G_INTRINSIC_LLRINT:
  case G_INTRINSIC_ROUND:
  case G_INTRINSIC_ROUNDEVEN:
  case G_LROUND:
  case G_LLROUND:
  case G_INTRINSIC_TRUNC:
  case G_FCOS:
  case G_FSIN:
  case G_FTAN:
  case G_FACOS:
  case G_FASIN:
  case G_FATAN:
  case G_FATAN2:
  case G_FCOSH:
  case G_FSINH:
  case G_FTANH:
  case G_FSQRT:
  case G_BSWAP:
  case G_BITREVERSE:
  case G_SDIV:
  case G_UDIV:
  case G_SREM:
  case G_UREM:
  case G_SDIVREM:
  case G_UDIVREM:
  case G_SMIN:
  case G_SMAX:
  case G_UMIN:
  case G_UMAX:
  case G_ABS:
  case G_FMINNUM:
  case G_FMAXNUM:
  case G_FMINNUM_IEEE:
  case G_FMAXNUM_IEEE:
  case G_FMINIMUM:
  case G_FMAXIMUM:
  case G_FMINIMUMNUM:
  case G_FMAXIMUMNUM:
  case G_FSHL:
  case G_FSHR:
  case G_ROTL:
  case G_ROTR:
  case G_FREEZE:
  case G_SADDSAT:
  case G_SSUBSAT:
  case G_UADDSAT:
  case G_USUBSAT:
  case G_UMULO:
  case G_SMULO:
  case G_SHL:
  case G_LSHR:
  case G_ASHR:
  case G_SSHLSAT:
  case G_USHLSAT:
  case G_CTLZ:
  case G_CTLZ_ZERO_UNDEF:
  case G_CTTZ:
  case G_CTTZ_ZERO_UNDEF:
  case G_CTPOP:
  case G_FCOPYSIGN:
  case G_ZEXT:
  case G_SEXT:
  case G_ANYEXT:
  case G_FPEXT:
  case G_FPTRUNC:
  case G_SITOFP:
  case G_UITOFP:
  case G_FPTOSI:
  case G_FPTOUI:
  case G_FPTOSI_SAT:
  case G_FPTOUI_SAT:
  case G_INTTOPTR:
  case G_PTRTOINT:
  case G_ADDRSPACE_CAST:
  case G_UADDO:
  case G_USUBO:
  case G_UADDE:
  case G_USUBE:
  case G_SADDO:
  case G_SSUBO:
  case G_SADDE:
  case G_SSUBE:
  case G_STRICT_FADD:
  case G_STRICT_FSUB:
  case G_STRICT_FMUL:
  case G_STRICT_FMA:
  case G_STRICT_FLDEXP:
  case G_FFREXP:
    return fewerElementsVectorMultiEltType(GMI, NumElts);
  // Element-wise operations with non-vector operands (predicates, immediates)
  // that must be replicated, not split — listed by operand index.
  case G_ICMP:
  case G_FCMP:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/});
  case G_IS_FPCLASS:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
  case G_SELECT:
    // A vector condition splits with the data; a scalar condition is shared
    // by every piece.
    if (MRI.getType(MI.getOperand(1).getReg()).isVector())
      return fewerElementsVectorMultiEltType(GMI, NumElts);
    return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
  case G_PHI:
    return fewerElementsVectorPhi(GMI, NumElts);
  case G_UNMERGE_VALUES:
    return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
  case G_BUILD_VECTOR:
    assert(TypeIdx == 0 && "not a vector type index");
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_CONCAT_VECTORS:
    if (TypeIdx != 1) // TODO: This probably does work as expected already.
      return UnableToLegalize;
    return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
  case G_EXTRACT_VECTOR_ELT:
  case G_INSERT_VECTOR_ELT:
    return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
  case G_LOAD:
  case G_STORE:
    return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
  case G_SEXT_INREG:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
  // All non-sequential (freely re-associable) vector reductions.
  GISEL_VECREDUCE_CASES_NONSEQ
    return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
  case TargetOpcode::G_VECREDUCE_SEQ_FADD:
  case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
    return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
  case G_SHUFFLE_VECTOR:
    return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
  case G_FPOWI:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
  case G_BITCAST:
    return fewerElementsBitcast(MI, TypeIdx, NarrowTy);
  case G_INTRINSIC_FPTRUNC_ROUND:
    return fewerElementsVectorMultiEltType(GMI, NumElts, {2});
  default:
    return UnableToLegalize;
  }
}
5563 
5564 LegalizerHelper::LegalizeResult
5565 LegalizerHelper::fewerElementsBitcast(MachineInstr &MI, unsigned int TypeIdx,
5566                                       LLT NarrowTy) {
5567   assert(MI.getOpcode() == TargetOpcode::G_BITCAST &&
5568          "Not a bitcast operation");
5569 
5570   if (TypeIdx != 0)
5571     return UnableToLegalize;
5572 
5573   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5574 
5575   unsigned NewElemCount =
5576       NarrowTy.getSizeInBits() / SrcTy.getScalarSizeInBits();
5577   LLT SrcNarrowTy = LLT::fixed_vector(NewElemCount, SrcTy.getElementType());
5578 
5579   // Split the Src and Dst Reg into smaller registers
5580   SmallVector<Register> SrcVRegs, BitcastVRegs;
5581   if (extractGCDType(SrcVRegs, DstTy, SrcNarrowTy, SrcReg) != SrcNarrowTy)
5582     return UnableToLegalize;
5583 
5584   // Build new smaller bitcast instructions
5585   // Not supporting Leftover types for now but will have to
5586   for (Register Reg : SrcVRegs)
5587     BitcastVRegs.push_back(MIRBuilder.buildBitcast(NarrowTy, Reg).getReg(0));
5588 
5589   MIRBuilder.buildMergeLikeInstr(DstReg, BitcastVRegs);
5590   MI.eraseFromParent();
5591   return Legalized;
5592 }
5593 
// Split a G_SHUFFLE_VECTOR with a too-wide result into two half-width
// shuffles (Lo and Hi), mirroring SelectionDAG's shuffle splitting: each half
// of the output mask is analyzed to find which of the four half-width inputs
// it reads from. If a half uses at most two inputs it becomes a new
// G_SHUFFLE_VECTOR; otherwise its elements are extracted individually and
// rebuilt with G_BUILD_VECTOR.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
  if (TypeIdx != 0)
    return UnableToLegalize;

  auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
      MI.getFirst3RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  // The shuffle should be canonicalized by now.
  if (DstTy != Src1Ty)
    return UnableToLegalize;
  if (DstTy != Src2Ty)
    return UnableToLegalize;

  // Splitting in two requires an even (power-of-2) element count.
  if (!isPowerOf2_32(DstTy.getNumElements()))
    return UnableToLegalize;

  // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
  // Further legalization attempts will be needed to do split further.
  NarrowTy =
      DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
  unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;

  // Split both sources in half; the four halves are the candidate inputs for
  // each half of the output.
  SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
  extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI);
  extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI);
  Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
                        SplitSrc2Regs[1]};

  Register Hi, Lo;

  // If Lo or Hi uses elements from at most two of the four input vectors, then
  // express it as a vector shuffle of those two inputs.  Otherwise extract the
  // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
  SmallVector<int, 16> Ops;
  for (unsigned High = 0; High < 2; ++High) {
    Register &Output = High ? Hi : Lo;

    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with useBuildVector set.
    unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
    unsigned FirstMaskIdx = High * NewElts;
    bool UseBuildVector = false;
    for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
      // The mask element.  This indexes into the input.
      int Idx = Mask[FirstMaskIdx + MaskOffset];

      // The input vector this mask element indexes into.
      // Note: a negative (undef) Idx wraps to a huge unsigned value and is
      // caught by the bounds check below.
      unsigned Input = (unsigned)Idx / NewElts;

      if (Input >= std::size(Inputs)) {
        // The mask element does not index into any input vector.
        Ops.push_back(-1);
        continue;
      }

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NewElts;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input) {
          // This input vector is already an operand.
          break;
        } else if (InputUsed[OpNo] == -1U) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= std::size(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Ops.push_back(Idx + OpNo * NewElts);
    }

    if (UseBuildVector) {
      LLT EltTy = NarrowTy.getElementType();
      SmallVector<Register, 16> SVOps;

      // Extract the input elements by hand.
      for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
        // The mask element.  This indexes into the input.
        int Idx = Mask[FirstMaskIdx + MaskOffset];

        // The input vector this mask element indexes into.
        unsigned Input = (unsigned)Idx / NewElts;

        if (Input >= std::size(Inputs)) {
          // The mask element is "undef" or indexes off the end of the input.
          SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
          continue;
        }

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NewElts;

        // Extract the vector element by hand.
        SVOps.push_back(MIRBuilder
                            .buildExtractVectorElement(
                                EltTy, Inputs[Input],
                                MIRBuilder.buildConstant(LLT::scalar(32), Idx))
                            .getReg(0));
      }

      // Construct the Lo/Hi output using a G_BUILD_VECTOR.
      Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
    } else if (InputUsed[0] == -1U) {
      // No input vectors were used! The result is undefined.
      Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
    } else {
      Register Op0 = Inputs[InputUsed[0]];
      // If only one input was used, use an undefined vector for the other.
      Register Op1 = InputUsed[1] == -1U
                         ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
                         : Inputs[InputUsed[1]];
      // At least one input vector was used. Create a new shuffle vector.
      Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
    }

    // Reset the per-half mask before processing the Hi half.
    Ops.clear();
  }

  // Concatenate the two halves into the original wide destination.
  MIRBuilder.buildMergeLikeInstr(DstReg, {Lo, Hi});
  MI.eraseFromParent();
  return Legalized;
}
5731 
// Split a non-sequential vector reduction over a too-wide source into
// reductions over NarrowTy-sized pieces (or into a chain/tree of scalar ops
// when NarrowTy is scalar), then combine the partial results with the
// reduction's scalar opcode. Non-sequential reductions may be freely
// re-associated, which is what makes this split legal.
LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
    MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
  auto &RdxMI = cast<GVecReduce>(MI);

  // Only the source (vector) type may be narrowed.
  if (TypeIdx != 1)
    return UnableToLegalize;

  // The semantics of the normal non-sequential reductions allow us to freely
  // re-associate the operation.
  auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();

  // The source must split into a whole number of NarrowTy pieces.
  if (NarrowTy.isVector() &&
      (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
    return UnableToLegalize;

  unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
  SmallVector<Register> SplitSrcs;
  // If NarrowTy is a scalar then we're being asked to scalarize.
  const unsigned NumParts =
      NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
                          : SrcTy.getNumElements();

  extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
  if (NarrowTy.isScalar()) {
    if (DstTy != NarrowTy)
      return UnableToLegalize; // FIXME: handle implicit extensions.

    if (isPowerOf2_32(NumParts)) {
      // Generate a tree of scalar operations to reduce the critical path.
      SmallVector<Register> PartialResults;
      unsigned NumPartsLeft = NumParts;
      // Each pass halves the number of values by combining adjacent pairs.
      while (NumPartsLeft > 1) {
        for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
          PartialResults.emplace_back(
              MIRBuilder
                  .buildInstr(ScalarOpc, {NarrowTy},
                              {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
                  .getReg(0));
        }
        SplitSrcs = PartialResults;
        PartialResults.clear();
        NumPartsLeft = SplitSrcs.size();
      }
      assert(SplitSrcs.size() == 1);
      MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
      MI.eraseFromParent();
      return Legalized;
    }
    // If we can't generate a tree, then just do sequential operations.
    Register Acc = SplitSrcs[0];
    for (unsigned Idx = 1; Idx < NumParts; ++Idx)
      Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
                .getReg(0);
    MIRBuilder.buildCopy(DstReg, Acc);
    MI.eraseFromParent();
    return Legalized;
  }
  // Vector NarrowTy: reduce each piece with the original reduction opcode.
  SmallVector<Register> PartialReductions;
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    PartialReductions.push_back(
        MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]})
            .getReg(0));
  }

  // If the types involved are powers of 2, we can generate intermediate vector
  // ops, before generating a final reduction operation.
  // NOTE(review): this path returns after PartialReductions were already
  // emitted above, presumably leaving those defs dead for later cleanup —
  // confirm downstream DCE handles them.
  if (isPowerOf2_32(SrcTy.getNumElements()) &&
      isPowerOf2_32(NarrowTy.getNumElements())) {
    return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
  }

  // Otherwise fold the partial reductions together sequentially; the final
  // combine writes DstReg directly.
  Register Acc = PartialReductions[0];
  for (unsigned Part = 1; Part < NumParts; ++Part) {
    if (Part == NumParts - 1) {
      MIRBuilder.buildInstr(ScalarOpc, {DstReg},
                            {Acc, PartialReductions[Part]});
    } else {
      Acc = MIRBuilder
                .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
                .getReg(0);
    }
  }
  MI.eraseFromParent();
  return Legalized;
}
5817 
5818 LegalizerHelper::LegalizeResult
5819 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
5820                                                   unsigned int TypeIdx,
5821                                                   LLT NarrowTy) {
5822   auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
5823       MI.getFirst3RegLLTs();
5824   if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
5825       DstTy != NarrowTy)
5826     return UnableToLegalize;
5827 
5828   assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
5829           MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
5830          "Unexpected vecreduce opcode");
5831   unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
5832                            ? TargetOpcode::G_FADD
5833                            : TargetOpcode::G_FMUL;
5834 
5835   SmallVector<Register> SplitSrcs;
5836   unsigned NumParts = SrcTy.getNumElements();
5837   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
5838   Register Acc = ScalarReg;
5839   for (unsigned i = 0; i < NumParts; i++)
5840     Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
5841               .getReg(0);
5842 
5843   MIRBuilder.buildCopy(DstReg, Acc);
5844   MI.eraseFromParent();
5845   return Legalized;
5846 }
5847 
5848 LegalizerHelper::LegalizeResult
5849 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
5850                                         LLT SrcTy, LLT NarrowTy,
5851                                         unsigned ScalarOpc) {
5852   SmallVector<Register> SplitSrcs;
5853   // Split the sources into NarrowTy size pieces.
5854   extractParts(SrcReg, NarrowTy,
5855                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs,
5856                MIRBuilder, MRI);
5857   // We're going to do a tree reduction using vector operations until we have
5858   // one NarrowTy size value left.
5859   while (SplitSrcs.size() > 1) {
5860     SmallVector<Register> PartialRdxs;
5861     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
5862       Register LHS = SplitSrcs[Idx];
5863       Register RHS = SplitSrcs[Idx + 1];
5864       // Create the intermediate vector op.
5865       Register Res =
5866           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
5867       PartialRdxs.push_back(Res);
5868     }
5869     SplitSrcs = std::move(PartialRdxs);
5870   }
5871   // Finally generate the requested NarrowTy based reduction.
5872   Observer.changingInstr(MI);
5873   MI.getOperand(1).setReg(SplitSrcs[0]);
5874   Observer.changedInstr(MI);
5875   return Legalized;
5876 }
5877 
// Narrow a shift (G_SHL / G_LSHR / G_ASHR) by a known-constant amount \p Amt
// into operations on the two HalfTy halves of the input, then merge the Lo/Hi
// results back into the full-width destination. For each opcode there are
// four cases: amount larger than the full width, amount larger than one half,
// amount exactly one half, and the general small-amount case that combines
// bits from both halves.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
                                             const LLT HalfTy, const LLT AmtTy) {

  // Split the input into its low and high halves.
  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // Shift by zero: just reassemble the input.
  if (Amt.isZero()) {
    MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
    MI.eraseFromParent();
    return Legalized;
  }

  LLT NVT = HalfTy;
  unsigned NVTBits = HalfTy.getSizeInBits();
  unsigned VTBits = 2 * NVTBits;

  SrcOp Lo(Register(0)), Hi(Register(0));
  if (MI.getOpcode() == TargetOpcode::G_SHL) {
    if (Amt.ugt(VTBits)) {
      // Shifting out everything: both halves are zero.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // Amount exceeds one half: Lo is zero, Hi comes entirely from InL.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = MIRBuilder.buildShl(NVT, InL,
                               MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
    } else if (Amt == NVTBits) {
      // Exactly one half: Lo becomes zero, InL moves up unchanged.
      Lo = MIRBuilder.buildConstant(NVT, 0);
      Hi = InL;
    } else {
      // Small amount: Hi gets InH shifted plus the bits carried out of InL.
      Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrLHS =
          MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
      auto OrRHS = MIRBuilder.buildLShr(
          NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
      Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
    }
  } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
    if (Amt.ugt(VTBits)) {
      // Shifting out everything: both halves are zero.
      Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt.ugt(NVTBits)) {
      // Amount exceeds one half: Lo comes entirely from InH, Hi is zero.
      Lo = MIRBuilder.buildLShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else if (Amt == NVTBits) {
      // Exactly one half: InH moves down unchanged, Hi becomes zero.
      Lo = InH;
      Hi = MIRBuilder.buildConstant(NVT, 0);
    } else {
      // Small amount: Lo gets InL shifted plus the bits carried out of InH.
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
    }
  } else {
    // G_ASHR: like G_LSHR but the vacated high bits replicate InH's sign,
    // obtained via an arithmetic shift by NVTBits - 1.
    if (Amt.ugt(VTBits)) {
      Hi = Lo = MIRBuilder.buildAShr(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt.ugt(NVTBits)) {
      Lo = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else if (Amt == NVTBits) {
      Lo = InH;
      Hi = MIRBuilder.buildAShr(NVT, InH,
                                MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
    } else {
      auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);

      auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
      auto OrRHS = MIRBuilder.buildShl(
          NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));

      Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
      Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
    }
  }

  // Reassemble the two halves into the destination.
  MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
  MI.eraseFromParent();

  return Legalized;
}
5965 
// TODO: Optimize if constant shift amount.
// Narrow a shift to half-width operations. If the amount operand narrows
// (TypeIdx == 1) the amount is simply truncated. Otherwise the value is split
// into two halves; a constant amount takes the optimized path above, and an
// unknown amount is expanded with selects over the "short" (Amt < half width)
// and "long" (Amt >= half width) cases, plus a zero-amount guard.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
                                   LLT RequestedTy) {
  if (TypeIdx == 1) {
    // Narrow only the shift-amount operand.
    Observer.changingInstr(MI);
    narrowScalarSrc(MI, RequestedTy, 2);
    Observer.changedInstr(MI);
    return Legalized;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  if (DstTy.isVector())
    return UnableToLegalize;

  Register Amt = MI.getOperand(2).getReg();
  LLT ShiftAmtTy = MRI.getType(Amt);
  const unsigned DstEltSize = DstTy.getScalarSizeInBits();
  // An odd width cannot be split into two equal halves.
  if (DstEltSize % 2 != 0)
    return UnableToLegalize;

  // Ignore the input type. We can only go to exactly half the size of the
  // input. If that isn't small enough, the resulting pieces will be further
  // legalized.
  const unsigned NewBitSize = DstEltSize / 2;
  const LLT HalfTy = LLT::scalar(NewBitSize);
  const LLT CondTy = LLT::scalar(1);

  // A known-constant amount has a much cheaper expansion.
  if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
    return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
                                       ShiftAmtTy);
  }

  // TODO: Expand with known bits.

  // Handle the fully general expansion by an unknown amount.
  auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);

  Register InL = MRI.createGenericVirtualRegister(HalfTy);
  Register InH = MRI.createGenericVirtualRegister(HalfTy);
  MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));

  // AmtExcess = Amt - NewBitSize (used when Amt >= NewBitSize);
  // AmtLack   = NewBitSize - Amt (used when Amt <  NewBitSize).
  auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
  auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);

  auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
  auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
  auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);

  Register ResultRegs[2];
  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL: {
    // Short: ShAmt < NewBitSize
    auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
    auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
    auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.

    // The IsZero select guards against the AmtLack sub-shift being a
    // full-width (poison) shift when Amt == 0.
    auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
    auto Hi = MIRBuilder.buildSelect(
        HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR: {
    // Short: ShAmt < NewBitSize
    auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});

    auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
    auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
    auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);

    // Long: ShAmt >= NewBitSize
    MachineInstrBuilder HiL;
    if (MI.getOpcode() == TargetOpcode::G_LSHR) {
      HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
    } else {
      auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
      HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
    }
    auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
                                     {InH, AmtExcess});     // Lo from Hi part.

    auto Lo = MIRBuilder.buildSelect(
        HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));

    auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);

    ResultRegs[0] = Lo.getReg(0);
    ResultRegs[1] = Hi.getReg(0);
    break;
  }
  default:
    llvm_unreachable("not a shift");
  }

  MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
  MI.eraseFromParent();
  return Legalized;
}
6075 
6076 LegalizerHelper::LegalizeResult
6077 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
6078                                        LLT MoreTy) {
6079   assert(TypeIdx == 0 && "Expecting only Idx 0");
6080 
6081   Observer.changingInstr(MI);
6082   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6083     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
6084     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
6085     moreElementsVectorSrc(MI, MoreTy, I);
6086   }
6087 
6088   MachineBasicBlock &MBB = *MI.getParent();
6089   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
6090   moreElementsVectorDst(MI, MoreTy, 0);
6091   Observer.changedInstr(MI);
6092   return Legalized;
6093 }
6094 
6095 MachineInstrBuilder LegalizerHelper::getNeutralElementForVecReduce(
6096     unsigned Opcode, MachineIRBuilder &MIRBuilder, LLT Ty) {
6097   assert(Ty.isScalar() && "Expected scalar type to make neutral element for");
6098 
6099   switch (Opcode) {
6100   default:
6101     llvm_unreachable(
6102         "getNeutralElementForVecReduce called with invalid opcode!");
6103   case TargetOpcode::G_VECREDUCE_ADD:
6104   case TargetOpcode::G_VECREDUCE_OR:
6105   case TargetOpcode::G_VECREDUCE_XOR:
6106   case TargetOpcode::G_VECREDUCE_UMAX:
6107     return MIRBuilder.buildConstant(Ty, 0);
6108   case TargetOpcode::G_VECREDUCE_MUL:
6109     return MIRBuilder.buildConstant(Ty, 1);
6110   case TargetOpcode::G_VECREDUCE_AND:
6111   case TargetOpcode::G_VECREDUCE_UMIN:
6112     return MIRBuilder.buildConstant(
6113         Ty, APInt::getAllOnes(Ty.getScalarSizeInBits()));
6114   case TargetOpcode::G_VECREDUCE_SMAX:
6115     return MIRBuilder.buildConstant(
6116         Ty, APInt::getSignedMinValue(Ty.getSizeInBits()));
6117   case TargetOpcode::G_VECREDUCE_SMIN:
6118     return MIRBuilder.buildConstant(
6119         Ty, APInt::getSignedMaxValue(Ty.getSizeInBits()));
6120   case TargetOpcode::G_VECREDUCE_FADD:
6121     return MIRBuilder.buildFConstant(Ty, -0.0);
6122   case TargetOpcode::G_VECREDUCE_FMUL:
6123     return MIRBuilder.buildFConstant(Ty, 1.0);
6124   case TargetOpcode::G_VECREDUCE_FMINIMUM:
6125   case TargetOpcode::G_VECREDUCE_FMAXIMUM:
6126     assert(false && "getNeutralElementForVecReduce unimplemented for "
6127                     "G_VECREDUCE_FMINIMUM and G_VECREDUCE_FMAXIMUM!");
6128   }
6129   llvm_unreachable("switch expected to return!");
6130 }
6131 
6132 LegalizerHelper::LegalizeResult
6133 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
6134                                     LLT MoreTy) {
6135   unsigned Opc = MI.getOpcode();
6136   switch (Opc) {
6137   case TargetOpcode::G_IMPLICIT_DEF:
6138   case TargetOpcode::G_LOAD: {
6139     if (TypeIdx != 0)
6140       return UnableToLegalize;
6141     Observer.changingInstr(MI);
6142     moreElementsVectorDst(MI, MoreTy, 0);
6143     Observer.changedInstr(MI);
6144     return Legalized;
6145   }
6146   case TargetOpcode::G_STORE:
6147     if (TypeIdx != 0)
6148       return UnableToLegalize;
6149     Observer.changingInstr(MI);
6150     moreElementsVectorSrc(MI, MoreTy, 0);
6151     Observer.changedInstr(MI);
6152     return Legalized;
6153   case TargetOpcode::G_AND:
6154   case TargetOpcode::G_OR:
6155   case TargetOpcode::G_XOR:
6156   case TargetOpcode::G_ADD:
6157   case TargetOpcode::G_SUB:
6158   case TargetOpcode::G_MUL:
6159   case TargetOpcode::G_FADD:
6160   case TargetOpcode::G_FSUB:
6161   case TargetOpcode::G_FMUL:
6162   case TargetOpcode::G_FDIV:
6163   case TargetOpcode::G_FCOPYSIGN:
6164   case TargetOpcode::G_UADDSAT:
6165   case TargetOpcode::G_USUBSAT:
6166   case TargetOpcode::G_SADDSAT:
6167   case TargetOpcode::G_SSUBSAT:
6168   case TargetOpcode::G_SMIN:
6169   case TargetOpcode::G_SMAX:
6170   case TargetOpcode::G_UMIN:
6171   case TargetOpcode::G_UMAX:
6172   case TargetOpcode::G_FMINNUM:
6173   case TargetOpcode::G_FMAXNUM:
6174   case TargetOpcode::G_FMINNUM_IEEE:
6175   case TargetOpcode::G_FMAXNUM_IEEE:
6176   case TargetOpcode::G_FMINIMUM:
6177   case TargetOpcode::G_FMAXIMUM:
6178   case TargetOpcode::G_FMINIMUMNUM:
6179   case TargetOpcode::G_FMAXIMUMNUM:
6180   case TargetOpcode::G_STRICT_FADD:
6181   case TargetOpcode::G_STRICT_FSUB:
6182   case TargetOpcode::G_STRICT_FMUL:
6183   case TargetOpcode::G_SHL:
6184   case TargetOpcode::G_ASHR:
6185   case TargetOpcode::G_LSHR: {
6186     Observer.changingInstr(MI);
6187     moreElementsVectorSrc(MI, MoreTy, 1);
6188     moreElementsVectorSrc(MI, MoreTy, 2);
6189     moreElementsVectorDst(MI, MoreTy, 0);
6190     Observer.changedInstr(MI);
6191     return Legalized;
6192   }
6193   case TargetOpcode::G_FMA:
6194   case TargetOpcode::G_STRICT_FMA:
6195   case TargetOpcode::G_FSHR:
6196   case TargetOpcode::G_FSHL: {
6197     Observer.changingInstr(MI);
6198     moreElementsVectorSrc(MI, MoreTy, 1);
6199     moreElementsVectorSrc(MI, MoreTy, 2);
6200     moreElementsVectorSrc(MI, MoreTy, 3);
6201     moreElementsVectorDst(MI, MoreTy, 0);
6202     Observer.changedInstr(MI);
6203     return Legalized;
6204   }
6205   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
6206   case TargetOpcode::G_EXTRACT:
6207     if (TypeIdx != 1)
6208       return UnableToLegalize;
6209     Observer.changingInstr(MI);
6210     moreElementsVectorSrc(MI, MoreTy, 1);
6211     Observer.changedInstr(MI);
6212     return Legalized;
6213   case TargetOpcode::G_INSERT:
6214   case TargetOpcode::G_INSERT_VECTOR_ELT:
6215   case TargetOpcode::G_FREEZE:
6216   case TargetOpcode::G_FNEG:
6217   case TargetOpcode::G_FABS:
6218   case TargetOpcode::G_FSQRT:
6219   case TargetOpcode::G_FCEIL:
6220   case TargetOpcode::G_FFLOOR:
6221   case TargetOpcode::G_FNEARBYINT:
6222   case TargetOpcode::G_FRINT:
6223   case TargetOpcode::G_INTRINSIC_ROUND:
6224   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
6225   case TargetOpcode::G_INTRINSIC_TRUNC:
6226   case TargetOpcode::G_BITREVERSE:
6227   case TargetOpcode::G_BSWAP:
6228   case TargetOpcode::G_FCANONICALIZE:
6229   case TargetOpcode::G_SEXT_INREG:
6230   case TargetOpcode::G_ABS:
6231   case TargetOpcode::G_CTLZ:
6232   case TargetOpcode::G_CTPOP:
6233     if (TypeIdx != 0)
6234       return UnableToLegalize;
6235     Observer.changingInstr(MI);
6236     moreElementsVectorSrc(MI, MoreTy, 1);
6237     moreElementsVectorDst(MI, MoreTy, 0);
6238     Observer.changedInstr(MI);
6239     return Legalized;
6240   case TargetOpcode::G_SELECT: {
6241     auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
6242     if (TypeIdx == 1) {
6243       if (!CondTy.isScalar() ||
6244           DstTy.getElementCount() != MoreTy.getElementCount())
6245         return UnableToLegalize;
6246 
6247       // This is turning a scalar select of vectors into a vector
6248       // select. Broadcast the select condition.
6249       auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
6250       Observer.changingInstr(MI);
6251       MI.getOperand(1).setReg(ShufSplat.getReg(0));
6252       Observer.changedInstr(MI);
6253       return Legalized;
6254     }
6255 
6256     if (CondTy.isVector())
6257       return UnableToLegalize;
6258 
6259     Observer.changingInstr(MI);
6260     moreElementsVectorSrc(MI, MoreTy, 2);
6261     moreElementsVectorSrc(MI, MoreTy, 3);
6262     moreElementsVectorDst(MI, MoreTy, 0);
6263     Observer.changedInstr(MI);
6264     return Legalized;
6265   }
6266   case TargetOpcode::G_UNMERGE_VALUES:
6267     return UnableToLegalize;
6268   case TargetOpcode::G_PHI:
6269     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
6270   case TargetOpcode::G_SHUFFLE_VECTOR:
6271     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
6272   case TargetOpcode::G_BUILD_VECTOR: {
6273     SmallVector<SrcOp, 8> Elts;
6274     for (auto Op : MI.uses()) {
6275       Elts.push_back(Op.getReg());
6276     }
6277 
6278     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
6279       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
6280     }
6281 
6282     MIRBuilder.buildDeleteTrailingVectorElements(
6283         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
6284     MI.eraseFromParent();
6285     return Legalized;
6286   }
6287   case TargetOpcode::G_SEXT:
6288   case TargetOpcode::G_ZEXT:
6289   case TargetOpcode::G_ANYEXT:
6290   case TargetOpcode::G_TRUNC:
6291   case TargetOpcode::G_FPTRUNC:
6292   case TargetOpcode::G_FPEXT:
6293   case TargetOpcode::G_FPTOSI:
6294   case TargetOpcode::G_FPTOUI:
6295   case TargetOpcode::G_FPTOSI_SAT:
6296   case TargetOpcode::G_FPTOUI_SAT:
6297   case TargetOpcode::G_SITOFP:
6298   case TargetOpcode::G_UITOFP: {
6299     Observer.changingInstr(MI);
6300     LLT SrcExtTy;
6301     LLT DstExtTy;
6302     if (TypeIdx == 0) {
6303       DstExtTy = MoreTy;
6304       SrcExtTy = LLT::fixed_vector(
6305           MoreTy.getNumElements(),
6306           MRI.getType(MI.getOperand(1).getReg()).getElementType());
6307     } else {
6308       DstExtTy = LLT::fixed_vector(
6309           MoreTy.getNumElements(),
6310           MRI.getType(MI.getOperand(0).getReg()).getElementType());
6311       SrcExtTy = MoreTy;
6312     }
6313     moreElementsVectorSrc(MI, SrcExtTy, 1);
6314     moreElementsVectorDst(MI, DstExtTy, 0);
6315     Observer.changedInstr(MI);
6316     return Legalized;
6317   }
6318   case TargetOpcode::G_ICMP:
6319   case TargetOpcode::G_FCMP: {
6320     if (TypeIdx != 1)
6321       return UnableToLegalize;
6322 
6323     Observer.changingInstr(MI);
6324     moreElementsVectorSrc(MI, MoreTy, 2);
6325     moreElementsVectorSrc(MI, MoreTy, 3);
6326     LLT CondTy = LLT::fixed_vector(
6327         MoreTy.getNumElements(),
6328         MRI.getType(MI.getOperand(0).getReg()).getElementType());
6329     moreElementsVectorDst(MI, CondTy, 0);
6330     Observer.changedInstr(MI);
6331     return Legalized;
6332   }
6333   case TargetOpcode::G_BITCAST: {
6334     if (TypeIdx != 0)
6335       return UnableToLegalize;
6336 
6337     LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
6338     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
6339 
6340     unsigned coefficient = SrcTy.getNumElements() * MoreTy.getNumElements();
6341     if (coefficient % DstTy.getNumElements() != 0)
6342       return UnableToLegalize;
6343 
6344     coefficient = coefficient / DstTy.getNumElements();
6345 
6346     LLT NewTy = SrcTy.changeElementCount(
6347         ElementCount::get(coefficient, MoreTy.isScalable()));
6348     Observer.changingInstr(MI);
6349     moreElementsVectorSrc(MI, NewTy, 1);
6350     moreElementsVectorDst(MI, MoreTy, 0);
6351     Observer.changedInstr(MI);
6352     return Legalized;
6353   }
6354   case TargetOpcode::G_VECREDUCE_FADD:
6355   case TargetOpcode::G_VECREDUCE_FMUL:
6356   case TargetOpcode::G_VECREDUCE_ADD:
6357   case TargetOpcode::G_VECREDUCE_MUL:
6358   case TargetOpcode::G_VECREDUCE_AND:
6359   case TargetOpcode::G_VECREDUCE_OR:
6360   case TargetOpcode::G_VECREDUCE_XOR:
6361   case TargetOpcode::G_VECREDUCE_SMAX:
6362   case TargetOpcode::G_VECREDUCE_SMIN:
6363   case TargetOpcode::G_VECREDUCE_UMAX:
6364   case TargetOpcode::G_VECREDUCE_UMIN: {
6365     LLT OrigTy = MRI.getType(MI.getOperand(1).getReg());
6366     MachineOperand &MO = MI.getOperand(1);
6367     auto NewVec = MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO);
6368     auto NeutralElement = getNeutralElementForVecReduce(
6369         MI.getOpcode(), MIRBuilder, MoreTy.getElementType());
6370 
6371     LLT IdxTy(TLI.getVectorIdxLLT(MIRBuilder.getDataLayout()));
6372     for (size_t i = OrigTy.getNumElements(), e = MoreTy.getNumElements();
6373          i != e; i++) {
6374       auto Idx = MIRBuilder.buildConstant(IdxTy, i);
6375       NewVec = MIRBuilder.buildInsertVectorElement(MoreTy, NewVec,
6376                                                    NeutralElement, Idx);
6377     }
6378 
6379     Observer.changingInstr(MI);
6380     MO.setReg(NewVec.getReg(0));
6381     Observer.changedInstr(MI);
6382     return Legalized;
6383   }
6384 
6385   default:
6386     return UnableToLegalize;
6387   }
6388 }
6389 
// Rewrite a G_SHUFFLE_VECTOR whose destination length differs from its source
// length into an equivalent shuffle where both lengths match.
//
// If the mask is shorter than the source, the destination is widened to the
// source length and the mask padded with undef (-1) lanes. If the mask is
// longer, each source is padded (by concatenation with undef vectors) up to a
// multiple of the source length, the mask indices into the second source are
// rebased, and, when the padded length overshoots the requested one, the
// needed elements are extracted individually and rebuilt into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned MaskNumElts = Mask.size();
  unsigned SrcNumElts = SrcTy.getNumElements();
  LLT DestEltTy = DstTy.getElementType();

  // Lengths already agree; nothing to do.
  if (MaskNumElts == SrcNumElts)
    return Legalized;

  if (MaskNumElts < SrcNumElts) {
    // Extend mask to match new destination vector size with
    // undef values.
    SmallVector<int, 16> NewMask(SrcNumElts, -1);
    llvm::copy(Mask, NewMask.begin());

    moreElementsVectorDst(MI, SrcTy, 0);
    MIRBuilder.setInstrAndDebugLoc(MI);
    MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                  MI.getOperand(1).getReg(),
                                  MI.getOperand(2).getReg(), NewMask);
    MI.eraseFromParent();

    return Legalized;
  }

  // Mask is longer than the sources: round the mask length up to a multiple
  // of the source length so the sources can be padded by whole-vector concat.
  unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
  unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
  LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);

  // Create new source vectors by concatenating the initial
  // source vectors with undefined vectors of the same size.
  auto Undef = MIRBuilder.buildUndef(SrcTy);
  SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
  SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
  MOps1[0] = MI.getOperand(1).getReg();
  MOps2[0] = MI.getOperand(2).getReg();

  auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
  auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);

  // Readjust mask for new input vector length. Indices that referred to the
  // second source must be shifted past the first source's new (padded) length.
  SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
  for (unsigned I = 0; I != MaskNumElts; ++I) {
    int Idx = Mask[I];
    if (Idx >= static_cast<int>(SrcNumElts))
      Idx += PaddedMaskNumElts - SrcNumElts;
    MappedOps[I] = Idx;
  }

  // If we got more elements than required, extract subvector.
  if (MaskNumElts != PaddedMaskNumElts) {
    auto Shuffle =
        MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);

    SmallVector<Register, 16> Elts(MaskNumElts);
    for (unsigned I = 0; I < MaskNumElts; ++I) {
      Elts[I] =
          MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
              .getReg(0);
    }
    MIRBuilder.buildBuildVector(DstReg, Elts);
  } else {
    MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
  }

  MI.eraseFromParent();
  return LegalizerHelper::LegalizeResult::Legalized;
}
6460 
// Widen the vectors of a G_SHUFFLE_VECTOR to MoreTy by padding both sources
// and remapping the mask. Only handles the canonical form where destination
// and both sources have the same type; mismatched lengths are delegated to
// equalizeVectorShuffleLengths.
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
                                           unsigned int TypeIdx, LLT MoreTy) {
  auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
  ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
  unsigned NumElts = DstTy.getNumElements();
  unsigned WidenNumElts = MoreTy.getNumElements();

  if (DstTy.isVector() && Src1Ty.isVector() &&
      DstTy.getNumElements() != Src1Ty.getNumElements()) {
    return equalizeVectorShuffleLengths(MI);
  }

  if (TypeIdx != 0)
    return UnableToLegalize;

  // Expect a canonicalized shuffle.
  if (DstTy != Src1Ty || DstTy != Src2Ty)
    return UnableToLegalize;

  moreElementsVectorSrc(MI, MoreTy, 1);
  moreElementsVectorSrc(MI, MoreTy, 2);

  // Adjust mask based on new input vector length. Lanes beyond the original
  // destination length are undef (-1); indices into the second source are
  // rebased because that source now starts at WidenNumElts.
  SmallVector<int, 16> NewMask(WidenNumElts, -1);
  for (unsigned I = 0; I != NumElts; ++I) {
    int Idx = Mask[I];
    if (Idx < static_cast<int>(NumElts))
      NewMask[I] = Idx;
    else
      NewMask[I] = Idx - NumElts + WidenNumElts;
  }
  moreElementsVectorDst(MI, MoreTy, 0);
  MIRBuilder.setInstrAndDebugLoc(MI);
  MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
                                MI.getOperand(1).getReg(),
                                MI.getOperand(2).getReg(), NewMask);
  MI.eraseFromParent();
  return Legalized;
}
6501 
// Multiply two multi-part integers (split into NarrowTy-sized registers,
// least-significant part first) using schoolbook long multiplication, writing
// DstRegs.size() result parts into DstRegs.
//
// For each result part, the partial products contributing to that position
// are collected: the low halves (G_MUL) of products landing on this position
// and the high halves (G_UMULH) of products from the previous position, plus
// the carry accumulated while summing the previous position. The factors are
// then summed with G_UADDO so that all carry-outs can be accumulated for the
// next position; the most-significant part skips carry tracking entirely.
void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
                                        ArrayRef<Register> Src1Regs,
                                        ArrayRef<Register> Src2Regs,
                                        LLT NarrowTy) {
  MachineIRBuilder &B = MIRBuilder;
  unsigned SrcParts = Src1Regs.size();
  unsigned DstParts = DstRegs.size();

  unsigned DstIdx = 0; // Low bits of the result.
  Register FactorSum =
      B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
  DstRegs[DstIdx] = FactorSum;

  // Carry accumulated while computing the previous result part.
  Register CarrySumPrevDstIdx;
  SmallVector<Register, 4> Factors;

  for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
    // Collect low parts of muls for DstIdx.
    for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
         i <= std::min(DstIdx, SrcParts - 1); ++i) {
      MachineInstrBuilder Mul =
          B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
      Factors.push_back(Mul.getReg(0));
    }
    // Collect high parts of muls from previous DstIdx.
    for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
         i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
      MachineInstrBuilder Umulh =
          B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
      Factors.push_back(Umulh.getReg(0));
    }
    // Add CarrySum from additions calculated for previous DstIdx.
    if (DstIdx != 1) {
      Factors.push_back(CarrySumPrevDstIdx);
    }

    Register CarrySum;
    // Add all factors and accumulate all carries into CarrySum.
    if (DstIdx != DstParts - 1) {
      MachineInstrBuilder Uaddo =
          B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
      FactorSum = Uaddo.getReg(0);
      CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i) {
        MachineInstrBuilder Uaddo =
            B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
        FactorSum = Uaddo.getReg(0);
        MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
        CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
      }
    } else {
      // Since value for the next index is not calculated, neither is CarrySum.
      FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
      for (unsigned i = 2; i < Factors.size(); ++i)
        FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
    }

    CarrySumPrevDstIdx = CarrySum;
    DstRegs[DstIdx] = FactorSum;
    Factors.clear();
  }
}
6564 
// Narrow a scalar add/sub (plain, overflow-producing, or carry-consuming) by
// splitting both operands into NarrowTy-sized parts (plus a possible smaller
// leftover part) and chaining narrow carry/borrow operations across them.
//
// All parts but the last use the unsigned overflow/carry opcodes; only the
// final (most-significant) part needs the signed variant when the original
// operation reports signed overflow.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstType = MRI.getType(DstReg);
  // FIXME: add support for vector types
  if (DstType.isVector())
    return UnableToLegalize;

  // Select the narrow opcodes: OpO starts the chain (produces carry-out only),
  // OpE continues it (consumes and produces carry), and OpF finishes it (may
  // be the signed variant so the overall overflow flag is computed correctly).
  unsigned Opcode = MI.getOpcode();
  unsigned OpO, OpE, OpF;
  switch (Opcode) {
  case TargetOpcode::G_SADDO:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_ADD:
    OpO = TargetOpcode::G_UADDO;
    OpE = TargetOpcode::G_UADDE;
    OpF = TargetOpcode::G_UADDE;
    if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
      OpF = TargetOpcode::G_SADDE;
    break;
  case TargetOpcode::G_SSUBO:
  case TargetOpcode::G_SSUBE:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SUB:
    OpO = TargetOpcode::G_USUBO;
    OpE = TargetOpcode::G_USUBE;
    OpF = TargetOpcode::G_USUBE;
    if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
      OpF = TargetOpcode::G_SSUBE;
    break;
  default:
    llvm_unreachable("Unexpected add/sub opcode!");
  }

  // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
  unsigned NumDefs = MI.getNumExplicitDefs();
  Register Src1 = MI.getOperand(NumDefs).getReg();
  Register Src2 = MI.getOperand(NumDefs + 1).getReg();
  Register CarryDst, CarryIn;
  if (NumDefs == 2)
    CarryDst = MI.getOperand(1).getReg();
  if (MI.getNumOperands() == NumDefs + 3)
    CarryIn = MI.getOperand(NumDefs + 2).getReg();

  // Split both sources into NarrowTy parts plus leftover parts; the leftover
  // parts are appended so the loop below processes them last.
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy, DummyTy;
  SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
  extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
               MIRBuilder, MRI);
  extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
               MRI);

  int NarrowParts = Src1Regs.size();
  Src1Regs.append(Src1Left);
  Src2Regs.append(Src2Left);
  DstRegs.reserve(Src1Regs.size());

  for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
    Register DstReg =
        MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
    Register CarryOut;
    // Forward the final carry-out to the destination register
    if (i == e - 1 && CarryDst)
      CarryOut = CarryDst;
    else
      CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));

    if (!CarryIn) {
      // First part of a chain with no incoming carry.
      MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i]});
    } else if (i == e - 1) {
      // Last part: may need the signed variant for the overflow flag.
      MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    } else {
      MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
                            {Src1Regs[i], Src2Regs[i], CarryIn});
    }

    DstRegs.push_back(DstReg);
    CarryIn = CarryOut;
  }
  insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
              ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
              ArrayRef(DstRegs).drop_front(NarrowParts));

  MI.eraseFromParent();
  return Legalized;
}
6660 
6661 LegalizerHelper::LegalizeResult
6662 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
6663   auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
6664 
6665   LLT Ty = MRI.getType(DstReg);
6666   if (Ty.isVector())
6667     return UnableToLegalize;
6668 
6669   unsigned Size = Ty.getSizeInBits();
6670   unsigned NarrowSize = NarrowTy.getSizeInBits();
6671   if (Size % NarrowSize != 0)
6672     return UnableToLegalize;
6673 
6674   unsigned NumParts = Size / NarrowSize;
6675   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
6676   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
6677 
6678   SmallVector<Register, 2> Src1Parts, Src2Parts;
6679   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
6680   extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
6681   extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
6682   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
6683 
6684   // Take only high half of registers if this is high mul.
6685   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
6686   MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
6687   MI.eraseFromParent();
6688   return Legalized;
6689 }
6690 
6691 LegalizerHelper::LegalizeResult
6692 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
6693                                    LLT NarrowTy) {
6694   if (TypeIdx != 0)
6695     return UnableToLegalize;
6696 
6697   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
6698 
6699   Register Src = MI.getOperand(1).getReg();
6700   LLT SrcTy = MRI.getType(Src);
6701 
6702   // If all finite floats fit into the narrowed integer type, we can just swap
6703   // out the result type. This is practically only useful for conversions from
6704   // half to at least 16-bits, so just handle the one case.
6705   if (SrcTy.getScalarType() != LLT::scalar(16) ||
6706       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
6707     return UnableToLegalize;
6708 
6709   Observer.changingInstr(MI);
6710   narrowScalarDst(MI, NarrowTy, 0,
6711                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
6712   Observer.changedInstr(MI);
6713   return Legalized;
6714 }
6715 
// Narrow the source (TypeIdx 1) of a G_EXTRACT by splitting it into NarrowTy
// parts, extracting from each part the segment that overlaps the requested
// bit range, and re-merging the collected segments into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
                                     LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  uint64_t NarrowSize = NarrowTy.getSizeInBits();

  int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
  // FIXME: add support for when SizeOp1 isn't an exact multiple of
  // NarrowSize.
  if (SizeOp1 % NarrowSize != 0)
    return UnableToLegalize;
  int NumParts = SizeOp1 / NarrowSize;

  SmallVector<Register, 2> SrcRegs, DstRegs;
  extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
               MIRBuilder, MRI);

  // [OpStart, OpStart + OpSize) is the bit range being extracted.
  Register OpReg = MI.getOperand(0).getReg();
  uint64_t OpStart = MI.getOperand(2).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int i = 0; i < NumParts; ++i) {
    // This source part covers bits [SrcStart, SrcStart + NarrowSize).
    unsigned SrcStart = i * NarrowSize;

    if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
      // No part of the extract uses this subregister, ignore it.
      continue;
    } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is extracted, forward the value.
      DstRegs.push_back(SrcRegs[i]);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset;
    uint64_t SegSize;
    if (OpStart < SrcStart) {
      ExtractOffset = 0;
      SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
    } else {
      ExtractOffset = OpStart - SrcStart;
      SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
    }

    Register SegReg = SrcRegs[i];
    if (ExtractOffset != 0 || SegSize != NarrowSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
    }

    DstRegs.push_back(SegReg);
  }

  // Reassemble the collected segments into the destination register.
  Register DstReg = MI.getOperand(0).getReg();
  if (MRI.getType(DstReg).isVector())
    MIRBuilder.buildBuildVector(DstReg, DstRegs);
  else if (DstRegs.size() > 1)
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
  else
    MIRBuilder.buildCopy(DstReg, DstRegs[0]);
  MI.eraseFromParent();
  return Legalized;
}
6782 
// Narrow the destination (TypeIdx 0) of a G_INSERT by splitting the base
// value into NarrowTy parts (plus a possible leftover part), inserting into
// each part the segment of the inserted value that overlaps it, and merging
// the parts back into the destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  // FIXME: Don't know how to handle secondary types yet.
  if (TypeIdx != 0)
    return UnableToLegalize;

  SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
  LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
  LLT LeftoverTy;
  extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
               LeftoverRegs, MIRBuilder, MRI);

  SrcRegs.append(LeftoverRegs);

  // [OpStart, OpStart + OpSize) is the bit range being overwritten.
  uint64_t NarrowSize = NarrowTy.getSizeInBits();
  Register OpReg = MI.getOperand(2).getReg();
  uint64_t OpStart = MI.getOperand(3).getImm();
  uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
  for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
    // This destination part covers bits [DstStart, DstStart + NarrowSize).
    unsigned DstStart = I * NarrowSize;

    if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
      // The entire subregister is defined by this insert, forward the new
      // value.
      DstRegs.push_back(OpReg);
      continue;
    }

    Register SrcReg = SrcRegs[I];
    if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
      // The leftover reg is smaller than NarrowTy, so we need to extend it.
      SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
      MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
    }

    if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
      // No part of the insert affects this subregister, forward the original.
      DstRegs.push_back(SrcReg);
      continue;
    }

    // OpSegStart is where this destination segment would start in OpReg if it
    // extended infinitely in both directions.
    int64_t ExtractOffset, InsertOffset;
    uint64_t SegSize;
    if (OpStart < DstStart) {
      InsertOffset = 0;
      ExtractOffset = DstStart - OpStart;
      SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
    } else {
      InsertOffset = OpStart - DstStart;
      ExtractOffset = 0;
      SegSize =
        std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
    }

    // Pull out the piece of the inserted value that lands in this part.
    Register SegReg = OpReg;
    if (ExtractOffset != 0 || SegSize != OpSize) {
      // A genuine extract is needed.
      SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
      MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
    }

    Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
    MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
    DstRegs.push_back(DstReg);
  }

  // The leftover part was widened to NarrowTy above, so the merged width may
  // exceed the original type; truncate back down in that case.
  uint64_t WideSize = DstRegs.size() * NarrowSize;
  Register DstReg = MI.getOperand(0).getReg();
  if (WideSize > RegTy.getSizeInBits()) {
    Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
    MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
    MIRBuilder.buildTrunc(DstReg, MergeReg);
  } else
    MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);

  MI.eraseFromParent();
  return Legalized;
}
6864 
6865 LegalizerHelper::LegalizeResult
6866 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
6867                                    LLT NarrowTy) {
6868   Register DstReg = MI.getOperand(0).getReg();
6869   LLT DstTy = MRI.getType(DstReg);
6870 
6871   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
6872 
6873   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
6874   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
6875   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
6876   LLT LeftoverTy;
6877   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
6878                     Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
6879     return UnableToLegalize;
6880 
6881   LLT Unused;
6882   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
6883                     Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
6884     llvm_unreachable("inconsistent extractParts result");
6885 
6886   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
6887     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
6888                                         {Src0Regs[I], Src1Regs[I]});
6889     DstRegs.push_back(Inst.getReg(0));
6890   }
6891 
6892   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
6893     auto Inst = MIRBuilder.buildInstr(
6894       MI.getOpcode(),
6895       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
6896     DstLeftoverRegs.push_back(Inst.getReg(0));
6897   }
6898 
6899   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
6900               LeftoverTy, DstLeftoverRegs);
6901 
6902   MI.eraseFromParent();
6903   return Legalized;
6904 }
6905 
6906 LegalizerHelper::LegalizeResult
6907 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
6908                                  LLT NarrowTy) {
6909   if (TypeIdx != 0)
6910     return UnableToLegalize;
6911 
6912   auto [DstReg, SrcReg] = MI.getFirst2Regs();
6913 
6914   LLT DstTy = MRI.getType(DstReg);
6915   if (DstTy.isVector())
6916     return UnableToLegalize;
6917 
6918   SmallVector<Register, 8> Parts;
6919   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
6920   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
6921   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
6922 
6923   MI.eraseFromParent();
6924   return Legalized;
6925 }
6926 
// Narrow the result of a G_SELECT with a scalar condition by splitting both
// value operands into NarrowTy parts (plus a possible leftover part),
// selecting piecewise with the shared condition, and reassembling the result.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
                                    LLT NarrowTy) {
  if (TypeIdx != 0)
    return UnableToLegalize;

  Register CondReg = MI.getOperand(1).getReg();
  LLT CondTy = MRI.getType(CondReg);
  if (CondTy.isVector()) // TODO: Handle vselect
    return UnableToLegalize;

  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);

  SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
  SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
  SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
  LLT LeftoverTy;
  // Split the true value; give up if the type cannot be decomposed.
  if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
                    Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
    return UnableToLegalize;

  // The false value has the same type, so this split must succeed too.
  LLT Unused;
  if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
                    Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
    llvm_unreachable("inconsistent extractParts result");

  // Select each NarrowTy piece under the common condition.
  for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(NarrowTy,
                                         CondReg, Src1Regs[I], Src2Regs[I]);
    DstRegs.push_back(Select.getReg(0));
  }

  // And each leftover piece.
  for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
    auto Select = MIRBuilder.buildSelect(
      LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
    DstLeftoverRegs.push_back(Select.getReg(0));
  }

  insertParts(DstReg, DstTy, NarrowTy, DstRegs,
              LeftoverTy, DstLeftoverRegs);

  MI.eraseFromParent();
  return Legalized;
}
6972 
6973 LegalizerHelper::LegalizeResult
6974 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
6975                                   LLT NarrowTy) {
6976   if (TypeIdx != 1)
6977     return UnableToLegalize;
6978 
6979   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6980   unsigned NarrowSize = NarrowTy.getSizeInBits();
6981 
6982   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6983     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
6984 
6985     MachineIRBuilder &B = MIRBuilder;
6986     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
6987     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
6988     auto C_0 = B.buildConstant(NarrowTy, 0);
6989     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
6990                                 UnmergeSrc.getReg(1), C_0);
6991     auto LoCTLZ = IsUndef ?
6992       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
6993       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
6994     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
6995     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
6996     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
6997     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
6998 
6999     MI.eraseFromParent();
7000     return Legalized;
7001   }
7002 
7003   return UnableToLegalize;
7004 }
7005 
// Narrow a G_CTTZ/G_CTTZ_ZERO_UNDEF whose source is exactly twice NarrowTy
// wide by splitting the source into low and high halves and combining the
// per-half counts.
LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
                                  LLT NarrowTy) {
  if (TypeIdx != 1)
    return UnableToLegalize;

  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned NarrowSize = NarrowTy.getSizeInBits();

  if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
    const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;

    MachineIRBuilder &B = MIRBuilder;
    auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
    // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
    auto C_0 = B.buildConstant(NarrowTy, 0);
    auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
                                UnmergeSrc.getReg(0), C_0);
    // The high half is only counted when the low half is zero; if the
    // original was zero-undef, the whole source being zero is undef anyway,
    // so the zero-undef variant is usable for the high half too.
    auto HiCTTZ = IsUndef ?
      B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
      B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
    auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
    auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
    // The low half is only counted when it is known nonzero, so the
    // zero-undef variant is always safe for it.
    auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
    B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
7038 
7039 LegalizerHelper::LegalizeResult
7040 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
7041                                    LLT NarrowTy) {
7042   if (TypeIdx != 1)
7043     return UnableToLegalize;
7044 
7045   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7046   unsigned NarrowSize = NarrowTy.getSizeInBits();
7047 
7048   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
7049     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
7050 
7051     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
7052     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
7053     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
7054 
7055     MI.eraseFromParent();
7056     return Legalized;
7057   }
7058 
7059   return UnableToLegalize;
7060 }
7061 
7062 LegalizerHelper::LegalizeResult
7063 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
7064                                     LLT NarrowTy) {
7065   if (TypeIdx != 1)
7066     return UnableToLegalize;
7067 
7068   MachineIRBuilder &B = MIRBuilder;
7069   Register ExpReg = MI.getOperand(2).getReg();
7070   LLT ExpTy = MRI.getType(ExpReg);
7071 
7072   unsigned ClampSize = NarrowTy.getScalarSizeInBits();
7073 
7074   // Clamp the exponent to the range of the target type.
7075   auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
7076   auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
7077   auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
7078   auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
7079 
7080   auto Trunc = B.buildTrunc(NarrowTy, Clamp);
7081   Observer.changingInstr(MI);
7082   MI.getOperand(2).setReg(Trunc.getReg(0));
7083   Observer.changedInstr(MI);
7084   return Legalized;
7085 }
7086 
/// Expand a bit-counting operation (G_CTLZ/G_CTTZ and their *_ZERO_UNDEF
/// variants, G_CTPOP) into operations the target can handle, following the
/// classic expansions from "Hacker's Delight".
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const auto &TII = MIRBuilder.getTII();
  // An operation is usable as a building block if the legalizer will accept
  // it in some form: legal as-is, via libcall, or custom-lowered.
  auto isSupported = [this](const LegalityQuery &Q) {
    auto QAction = LI.getAction(Q).Action;
    return QAction == Legal || QAction == Libcall || QAction == Custom;
  };
  switch (Opc) {
  default:
    return UnableToLegalize;
  case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
    // This trivially expands to CTLZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTLZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
    unsigned Len = SrcTy.getSizeInBits();

    if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
      // ctlz(0) must be the bit width, which _ZERO_UNDEF leaves undefined.
      auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
      auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we do this:
    // NewLen = NextPowerOf2(Len);
    // x = x | (x >> 1);
    // x = x | (x >> 2);
    // ...
    // x = x | (x >>16);
    // x = x | (x >>32); // for 64-bit input
    // Upto NewLen/2
    // return Len - popcount(x);
    //
    // Ref: "Hacker's Delight" by Henry Warren
    // The or/shift ladder smears the most significant set bit into every bit
    // below it, so the leading-zero count equals Len minus the popcount.
    Register Op = SrcReg;
    unsigned NewLen = PowerOf2Ceil(Len);
    for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
      auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
      auto MIBOp = MIRBuilder.buildOr(
          SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
      Op = MIBOp.getReg(0);
    }
    auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
    MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
                        MIBPop);
    MI.eraseFromParent();
    return Legalized;
  }
  case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
    // This trivially expands to CTTZ.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTTZ: {
    auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();

    unsigned Len = SrcTy.getSizeInBits();
    if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
      // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
      // zero.
      auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
      auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
      auto ICmp = MIRBuilder.buildICmp(
          CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
      auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
      MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
      MI.eraseFromParent();
      return Legalized;
    }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return 32 - nlz(~x & (x-1)); }
    // Ref: "Hacker's Delight" by Henry Warren
    // ~x & (x - 1) turns the trailing zeros (and nothing else) into ones.
    auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
    auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
    auto MIBTmp = MIRBuilder.buildAnd(
        SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
    if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
        isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
      auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
      MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
                          MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
      MI.eraseFromParent();
      return Legalized;
    }
    // Reuse this instruction in place as a CTPOP of the mask computed above.
    Observer.changingInstr(MI);
    MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
    MI.getOperand(1).setReg(MIBTmp.getReg(0));
    Observer.changedInstr(MI);
    return Legalized;
  }
  case TargetOpcode::G_CTPOP: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT Ty = MRI.getType(SrcReg);
    unsigned Size = Ty.getSizeInBits();
    MachineIRBuilder &B = MIRBuilder;

    // Count set bits in blocks of 2 bits. Default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
    // We use following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives same result in blocks of 2 with one instruction less.
    auto C_1 = B.buildConstant(Ty, 1);
    auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
    APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
    auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
    auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
    auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);

    // In order to get count in blocks of 4 add values from adjacent block of 2.
    // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
    auto C_2 = B.buildConstant(Ty, 2);
    auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
    APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
    auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
    auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
    auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
    auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);

    // For count in blocks of 8 bits we don't have to mask high 4 bits before
    // addition since count value sits in range {0,...,8} and 4 bits are enough
    // to hold such binary values. After addition high 4 bits still hold count
    // of set bits in high 4 bit block, set them to zero and get 8 bit result.
    // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
    auto C_4 = B.buildConstant(Ty, 4);
    auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
    auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
    APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
    auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
    auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);

    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
    // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
    auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));

    // Shift count result from 8 high bits to low bits.
    auto C_SizeM8 = B.buildConstant(Ty, Size - 8);

    // The multiply trick needs a usable G_MUL; note this check accepts
    // WidenScalar, unlike isSupported above, and deliberately excludes Lower.
    auto IsMulSupported = [this](const LLT Ty) {
      auto Action = LI.getAction({TargetOpcode::G_MUL, {Ty}}).Action;
      return Action == Legal || Action == WidenScalar || Action == Custom;
    };
    if (IsMulSupported(Ty)) {
      auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
    } else {
      // No usable multiply: sum the per-byte counts into the top byte with a
      // logarithmic shift/add ladder instead.
      auto ResTmp = B8Count;
      for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
        auto ShiftC = B.buildConstant(Ty, Shift);
        auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
        ResTmp = B.buildAdd(Ty, ResTmp, Shl);
      }
      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
    }
    MI.eraseFromParent();
    return Legalized;
  }
  }
}
7259 
7260 // Check that (every element of) Reg is undef or not an exact multiple of BW.
7261 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
7262                                         Register Reg, unsigned BW) {
7263   return matchUnaryPredicate(
7264       MRI, Reg,
7265       [=](const Constant *C) {
7266         // Null constant here means an undef.
7267         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
7268         return !CI || CI->getValue().urem(BW) != 0;
7269       },
7270       /*AllowUndefs*/ true);
7271 }
7272 
7273 LegalizerHelper::LegalizeResult
7274 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
7275   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
7276   LLT Ty = MRI.getType(Dst);
7277   LLT ShTy = MRI.getType(Z);
7278 
7279   unsigned BW = Ty.getScalarSizeInBits();
7280 
7281   if (!isPowerOf2_32(BW))
7282     return UnableToLegalize;
7283 
7284   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7285   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7286 
7287   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
7288     // fshl X, Y, Z -> fshr X, Y, -Z
7289     // fshr X, Y, Z -> fshl X, Y, -Z
7290     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
7291     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
7292   } else {
7293     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
7294     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
7295     auto One = MIRBuilder.buildConstant(ShTy, 1);
7296     if (IsFSHL) {
7297       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
7298       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
7299     } else {
7300       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
7301       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
7302     }
7303 
7304     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
7305   }
7306 
7307   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
7308   MI.eraseFromParent();
7309   return Legalized;
7310 }
7311 
/// Lower G_FSHL/G_FSHR into a pair of ordinary shifts OR'd together, taking
/// care never to emit a shift by the full bit width (which is poison).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
  auto [Dst, X, Y, Z] = MI.getFirst4Regs();
  LLT Ty = MRI.getType(Dst);
  LLT ShTy = MRI.getType(Z);

  const unsigned BW = Ty.getScalarSizeInBits();
  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;

  Register ShX, ShY;
  Register ShAmt, InvShAmt;

  // FIXME: Emit optimized urem by constant instead of letting it expand later.
  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
    // fshl: X << C | Y >> (BW - C)
    // fshr: X << (BW - C) | Y >> C
    // where C = Z % BW is not zero
    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
  } else {
    // Z % BW may be zero, so compute the inverse amount as BW-1-(Z%BW) and
    // pre-shift the second operand by 1 so no single shift can reach BW:
    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
    if (isPowerOf2_32(BW)) {
      // Z % BW -> Z & (BW - 1)
      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
    } else {
      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
    }

    auto One = MIRBuilder.buildConstant(ShTy, 1);
    if (IsFSHL) {
      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
    } else {
      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
    }
  }

  // The two halves select disjoint bit positions, hence the Disjoint flag.
  MIRBuilder.buildOr(Dst, ShX, ShY, MachineInstr::Disjoint);
  MI.eraseFromParent();
  return Legalized;
}
7366 
7367 LegalizerHelper::LegalizeResult
7368 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
7369   // These operations approximately do the following (while avoiding undefined
7370   // shifts by BW):
7371   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
7372   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
7373   Register Dst = MI.getOperand(0).getReg();
7374   LLT Ty = MRI.getType(Dst);
7375   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
7376 
7377   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
7378   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
7379 
7380   // TODO: Use smarter heuristic that accounts for vector legalization.
7381   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
7382     return lowerFunnelShiftAsShifts(MI);
7383 
7384   // This only works for powers of 2, fallback to shifts if it fails.
7385   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
7386   if (Result == UnableToLegalize)
7387     return lowerFunnelShiftAsShifts(MI);
7388   return Result;
7389 }
7390 
/// Split a too-wide extend (the opcode is reused generically for zext/sext/
/// anyext) into two steps: extend to an intermediate type of twice the source
/// element size, then unmerge, extend each half, and merge the results.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  uint32_t DstTySize = DstTy.getSizeInBits();
  uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
  uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();

  // The halving/doubling below only works out for power-of-2 sizes.
  if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
      !isPowerOf2_32(SrcTyScalarSize))
    return UnableToLegalize;

  // The step between extend is too large, split it by creating an intermediate
  // extend instruction
  if (SrcTyScalarSize * 2 < DstTyScalarSize) {
    LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
    // If the destination type is illegal, split it into multiple statements
    // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
    auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
    // Unmerge the vector
    // NOTE(review): the element-count manipulation below assumes vector
    // types — confirm scalar sources cannot reach this path.
    LLT EltTy = MidTy.changeElementCount(
        MidTy.getElementCount().divideCoefficientBy(2));
    auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);

    // ZExt the vectors
    LLT ZExtResTy = DstTy.changeElementCount(
        DstTy.getElementCount().divideCoefficientBy(2));
    auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(0)});
    auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
                                          {UnmergeSrc.getReg(1)});

    // Merge the ending vectors
    MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});

    MI.eraseFromParent();
    return Legalized;
  }
  return UnableToLegalize;
}
7432 
/// Split a wide vector truncate into two narrower truncates that are merged
/// and (if needed) truncated once more to reach the final element size.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectiondDAG, we can handle
  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
  //   %lo16(<4 x s16>) = G_TRUNC %inlo
  //   %hi16(<4 x s16>) = G_TRUNC %inhi
  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
  //   %res(<8 x s8>) = G_TRUNC %in16

  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);

  // Only vector destinations with power-of-2 element counts and element sizes
  // are handled here.
  if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
      isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
      isPowerOf2_32(SrcTy.getNumElements()) &&
      isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
    // Split input type.
    LLT SplitSrcTy = SrcTy.changeElementCount(
        SrcTy.getElementCount().divideCoefficientBy(2));

    // First, split the source into two smaller vectors.
    SmallVector<Register, 2> SplitSrcs;
    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);

    // Truncate the splits into intermediate narrower elements.
    // If the final element is less than half the source element, stop at
    // double the final size so the last G_TRUNC below finishes the job.
    LLT InterTy;
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
    else
      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
    for (Register &Src : SplitSrcs)
      Src = MIRBuilder.buildTrunc(InterTy, Src).getReg(0);

    // Combine the new truncates into one vector
    auto Merge = MIRBuilder.buildMergeLikeInstr(
        DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);

    // Truncate the new vector to the final result type
    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
      MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
    else
      MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));

    MI.eraseFromParent();

    return Legalized;
  }
  return UnableToLegalize;
}
7488 
7489 LegalizerHelper::LegalizeResult
7490 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
7491   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
7492   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
7493   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
7494   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
7495   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
7496   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
7497   MI.eraseFromParent();
7498   return Legalized;
7499 }
7500 
/// Lower G_ROTL/G_ROTR, preferring (in order): the reverse rotate, a funnel
/// shift with both data operands equal to the source, and finally an
/// expansion into two ordinary shifts OR'd together.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();

  unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
  bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;

  MIRBuilder.setInstrAndDebugLoc(MI);

  // If a rotate in the other direction is supported, use it.
  unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
  if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
      isPowerOf2_32(EltSizeInBits))
    return lowerRotateWithReverseRotate(MI);

  // If a funnel shift is supported, use it.
  // A rotate is a funnel shift with both data inputs equal to Src.
  unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
  bool IsFShLegal = false;
  if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
      LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
    auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
                                Register R3) {
      MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
      MI.eraseFromParent();
      return Legalized;
    };
    // If a funnel shift in the other direction is supported, use it.
    if (IsFShLegal) {
      return buildFunnelShift(FShOpc, Dst, Src, Amt);
    } else if (isPowerOf2_32(EltSizeInBits)) {
      // rot(x, z) == rev-rot(x, -z) when the width is a power of 2.
      Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
      return buildFunnelShift(RevFsh, Dst, Src, Amt);
    }
  }

  // Fallback: a shift in the rotate direction OR'd with the remaining bits
  // shifted in from the other side, keeping all amounts below the width.
  auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
  unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
  unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
  auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
  Register ShVal;
  Register RevShiftVal;
  if (isPowerOf2_32(EltSizeInBits)) {
    // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
    // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
    auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
    auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
  } else {
    // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
    // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
    // The extra shift by 1 keeps every shift amount strictly below w.
    auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
    auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
    ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
    auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
    auto One = MIRBuilder.buildConstant(AmtTy, 1);
    auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
    RevShiftVal =
        MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
  }
  MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
  MI.eraseFromParent();
  return Legalized;
}
7567 
// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
// representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const LLT S1 = LLT::scalar(1);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

  // unsigned cul2f(ulong u) {
  //   uint lz = clz(u);
  //   uint e = (u != 0) ? 127U + 63U - lz : 0;
  //   u = (u << lz) & 0x7fffffffffffffffUL;
  //   ulong t = u & 0xffffffffffUL;
  //   uint v = (e << 23) | (uint)(u >> 40);
  //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
  //   return as_float(v + r);
  // }

  auto Zero32 = MIRBuilder.buildConstant(S32, 0);
  auto Zero64 = MIRBuilder.buildConstant(S64, 0);

  // lz: how far Src must be shifted left to normalize it.
  auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);

  // Biased exponent: 127 (IEEE bias) + 63 (MSB position) - lz, or 0 if u == 0.
  auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
  auto Sub = MIRBuilder.buildSub(S32, K, LZ);

  auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
  auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);

  // Normalize and drop the (implicit) leading one bit.
  auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
  auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);

  auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);

  // t: the low 40 bits that do not fit into the 23-bit mantissa.
  auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
  auto T = MIRBuilder.buildAnd(S64, U, Mask1);

  // v: exponent and the 23 kept mantissa bits packed together.
  auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
  auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
  auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));

  // Round to nearest even on the discarded bits: r is 1 if they exceed half a
  // ULP, v's LSB if exactly half, else 0.
  auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
  auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
  auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
  auto One = MIRBuilder.buildConstant(S32, 1);

  auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
  auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
  auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
  MIRBuilder.buildAdd(Dst, V, R);

  MI.eraseFromParent();
  return Legalized;
}
7625 
7626 // Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
7627 // operations and G_SITOFP
7628 LegalizerHelper::LegalizeResult
7629 LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
7630   auto [Dst, Src] = MI.getFirst2Regs();
7631   const LLT S64 = LLT::scalar(64);
7632   const LLT S32 = LLT::scalar(32);
7633   const LLT S1 = LLT::scalar(1);
7634 
7635   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
7636 
7637   // For i64 < INT_MAX we simply reuse SITOFP.
7638   // Otherwise, divide i64 by 2, round result by ORing with the lowest bit
7639   // saved before division, convert to float by SITOFP, multiply the result
7640   // by 2.
7641   auto One = MIRBuilder.buildConstant(S64, 1);
7642   auto Zero = MIRBuilder.buildConstant(S64, 0);
7643   // Result if Src < INT_MAX
7644   auto SmallResult = MIRBuilder.buildSITOFP(S32, Src);
7645   // Result if Src >= INT_MAX
7646   auto Halved = MIRBuilder.buildLShr(S64, Src, One);
7647   auto LowerBit = MIRBuilder.buildAnd(S64, Src, One);
7648   auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
7649   auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
7650   auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
7651   // Check if the original value is larger than INT_MAX by comparing with
7652   // zero to pick one of the two conversions.
7653   auto IsLarge =
7654       MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
7655   MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);
7656 
7657   MI.eraseFromParent();
7658   return Legalized;
7659 }
7660 
// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
// IEEE double representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  // We create double value from 32 bit parts with 32 exponent difference.
  // Note that + and - are float operations that adjust the implicit leading
  // one, the bases 2^52 and 2^84 are for illustrative purposes.
  //
  // X = 2^52 * 1.0...LowBits
  // Y = 2^84 * 1.0...HighBits
  // Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
  //         = - 2^52 * 1.0...HighBits
  // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
  auto TwoP52 = MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000));
  auto TwoP84 = MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000));
  auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
  auto TwoP52P84FP = MIRBuilder.buildFConstant(S64, TwoP52P84);
  auto HalfWidth = MIRBuilder.buildConstant(S64, 32);

  // OR-ing the raw 32-bit halves into the exponent constants places them
  // directly in the doubles' mantissa fields (bit patterns, not arithmetic).
  auto LowBits = MIRBuilder.buildTrunc(S32, Src);
  LowBits = MIRBuilder.buildZExt(S64, LowBits);
  auto LowBitsFP = MIRBuilder.buildOr(S64, TwoP52, LowBits);
  auto HighBits = MIRBuilder.buildLShr(S64, Src, HalfWidth);
  auto HighBitsFP = MIRBuilder.buildOr(S64, TwoP84, HighBits);
  auto Scratch = MIRBuilder.buildFSub(S64, HighBitsFP, TwoP52P84FP);
  MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);

  MI.eraseFromParent();
  return Legalized;
}
7697 
7698 /// i64->fp16 itofp can be lowered to i64->f64,f64->f32,f32->f16. We cannot
7699 /// convert fpround f64->f16 without double-rounding, so we manually perform the
7700 /// lowering here where we know it is valid.
7701 static LegalizerHelper::LegalizeResult
7702 loweri64tof16ITOFP(MachineInstr &MI, Register Dst, LLT DstTy, Register Src,
7703                    LLT SrcTy, MachineIRBuilder &MIRBuilder) {
7704   auto M1 = MI.getOpcode() == TargetOpcode::G_UITOFP
7705                 ? MIRBuilder.buildUITOFP(SrcTy, Src)
7706                 : MIRBuilder.buildSITOFP(SrcTy, Src);
7707   LLT S32Ty = SrcTy.changeElementSize(32);
7708   auto M2 = MIRBuilder.buildFPTrunc(S32Ty, M1);
7709   MIRBuilder.buildFPTrunc(Dst, M2);
7710   MI.eraseFromParent();
7711   return LegalizerHelper::Legalized;
7712 }
7713 
7714 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
7715   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7716 
7717   if (SrcTy == LLT::scalar(1)) {
7718     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
7719     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
7720     MIRBuilder.buildSelect(Dst, Src, True, False);
7721     MI.eraseFromParent();
7722     return Legalized;
7723   }
7724 
7725   if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
7726     return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
7727 
7728   if (SrcTy != LLT::scalar(64))
7729     return UnableToLegalize;
7730 
7731   if (DstTy == LLT::scalar(32))
7732     // TODO: SelectionDAG has several alternative expansions to port which may
7733     // be more reasonable depending on the available instructions. We also need
7734     // a more advanced mechanism to choose an optimal version depending on
7735     // target features such as sitofp or CTLZ availability.
7736     return lowerU64ToF32WithSITOFP(MI);
7737 
7738   if (DstTy == LLT::scalar(64))
7739     return lowerU64ToF64BitFloatOps(MI);
7740 
7741   return UnableToLegalize;
7742 }
7743 
7744 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
7745   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
7746 
7747   const LLT S64 = LLT::scalar(64);
7748   const LLT S32 = LLT::scalar(32);
7749   const LLT S1 = LLT::scalar(1);
7750 
7751   if (SrcTy == S1) {
7752     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
7753     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
7754     MIRBuilder.buildSelect(Dst, Src, True, False);
7755     MI.eraseFromParent();
7756     return Legalized;
7757   }
7758 
7759   if (DstTy.getScalarSizeInBits() == 16 && SrcTy.getScalarSizeInBits() == 64)
7760     return loweri64tof16ITOFP(MI, Dst, DstTy, Src, SrcTy, MIRBuilder);
7761 
7762   if (SrcTy != S64)
7763     return UnableToLegalize;
7764 
7765   if (DstTy == S32) {
7766     // signed cl2f(long l) {
7767     //   long s = l >> 63;
7768     //   float r = cul2f((l + s) ^ s);
7769     //   return s ? -r : r;
7770     // }
7771     Register L = Src;
7772     auto SignBit = MIRBuilder.buildConstant(S64, 63);
7773     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
7774 
7775     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
7776     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
7777     auto R = MIRBuilder.buildUITOFP(S32, Xor);
7778 
7779     auto RNeg = MIRBuilder.buildFNeg(S32, R);
7780     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
7781                                             MIRBuilder.buildConstant(S64, 0));
7782     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
7783     MI.eraseFromParent();
7784     return Legalized;
7785   }
7786 
7787   return UnableToLegalize;
7788 }
7789 
// Expand G_FPTOUI in terms of G_FPTOSI:
//   fptoui(x) = x < 2^(N-1) ? fptosi(x)
//                           : fptosi(x - 2^(N-1)) with the result's top bit set
// where N is the destination width.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // Only scalar f32/f64 sources and s32/s64 destinations are handled here.
  if (SrcTy != S64 && SrcTy != S32)
    return UnableToLegalize;
  if (DstTy != S32 && DstTy != S64)
    return UnableToLegalize;

  // FPTOSI gives same result as FPTOUI for positive signed integers.
  // FPTOUI needs to deal with fp values that convert to unsigned integers
  // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.

  // 2^Exp as an integer (just the destination's sign-mask bit), and the same
  // value converted into the source's floating-point semantics.
  APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
  APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
                                                : APFloat::IEEEdouble(),
                    APInt::getZero(SrcTy.getSizeInBits()));
  TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);

  // Direct conversion, used for values below the threshold.
  MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);

  MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
  // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
  MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
  MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
  MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
  MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);

  const LLT S1 = LLT::scalar(1);

  // ULT: NaN compares true, so a NaN input takes the plain FPTOSI result.
  MachineInstrBuilder FCMP =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
  MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);

  MI.eraseFromParent();
  return Legalized;
}
7829 
// Expand G_FPTOSI (f32 -> i64 only) with integer bit manipulation of the
// float's sign/exponent/mantissa fields.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // FIXME: Only f32 to i64 conversions are supported.
  if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
    return UnableToLegalize;

  // Expand f32 -> i64 conversion
  // This algorithm comes from compiler-rt's implementation of fixsfdi:
  // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c

  unsigned SrcEltBits = SrcTy.getScalarSizeInBits();

  // Extract the biased exponent field (bits 23..30 for f32).
  auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
  auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);

  auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
  auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);

  // Sign, arithmetically shifted so it fills the word: 0 for positive inputs,
  // -1 for negative, then sign-extended to the destination width.
  auto SignMask = MIRBuilder.buildConstant(SrcTy,
                                           APInt::getSignMask(SrcEltBits));
  auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
  auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
  auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
  Sign = MIRBuilder.buildSExt(DstTy, Sign);

  // Mantissa with the implicit leading one (bit 23) OR'd back in.
  auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
  auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
  auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);

  auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
  R = MIRBuilder.buildZExt(DstTy, R);

  // Unbias the exponent (f32 bias is 127), then compute both possible shift
  // amounts: left when the exponent exceeds the mantissa width, else right.
  auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
  auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
  auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
  auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);

  auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
  auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);

  const LLT S1 = LLT::scalar(1);
  auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
                                    S1, Exponent, ExponentLoBit);

  R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);

  // Apply the sign: (R ^ Sign) - Sign negates R when Sign is all ones.
  auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
  auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);

  auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);

  // A negative unbiased exponent means |x| < 1, which truncates to zero.
  auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
                                          S1, Exponent, ZeroSrcTy);

  auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
  MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);

  MI.eraseFromParent();
  return Legalized;
}
7893 
// Lower G_FPTOSI_SAT / G_FPTOUI_SAT: float-to-int conversion that clamps
// out-of-range inputs to the destination's min/max and maps NaN to zero.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTOINT_SAT(MachineInstr &MI) {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI_SAT;
  unsigned SatWidth = DstTy.getScalarSizeInBits();

  // Determine minimum and maximum integer values and their corresponding
  // floating-point values.
  APInt MinInt, MaxInt;
  if (IsSigned) {
    MinInt = APInt::getSignedMinValue(SatWidth);
    MaxInt = APInt::getSignedMaxValue(SatWidth);
  } else {
    MinInt = APInt::getMinValue(SatWidth);
    MaxInt = APInt::getMaxValue(SatWidth);
  }

  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
  APFloat MinFloat(Semantics);
  APFloat MaxFloat(Semantics);

  // Round toward zero so the float bounds never exceed the integer bounds;
  // opInexact tells us whether the bounds are exactly representable.
  APFloat::opStatus MinStatus =
      MinFloat.convertFromAPInt(MinInt, IsSigned, APFloat::rmTowardZero);
  APFloat::opStatus MaxStatus =
      MaxFloat.convertFromAPInt(MaxInt, IsSigned, APFloat::rmTowardZero);
  bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) &&
                             !(MaxStatus & APFloat::opStatus::opInexact);

  // If the integer bounds are exactly representable as floats, emit a
  // min+max+fptoi sequence. Otherwise we have to use a sequence of comparisons
  // and selects.
  if (AreExactFloatBounds) {
    // Clamp Src by MinFloat from below. If Src is NaN the result is MinFloat.
    auto MaxC = MIRBuilder.buildFConstant(SrcTy, MinFloat);
    auto MaxP = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT,
                                     SrcTy.changeElementSize(1), Src, MaxC);
    auto Max = MIRBuilder.buildSelect(SrcTy, MaxP, Src, MaxC);
    // Clamp by MaxFloat from above. NaN cannot occur.
    auto MinC = MIRBuilder.buildFConstant(SrcTy, MaxFloat);
    auto MinP =
        MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, SrcTy.changeElementSize(1), Max,
                             MinC, MachineInstr::FmNoNans);
    auto Min =
        MIRBuilder.buildSelect(SrcTy, MinP, Max, MinC, MachineInstr::FmNoNans);
    // Convert clamped value to integer. In the unsigned case we're done,
    // because we mapped NaN to MinFloat, which will cast to zero.
    if (!IsSigned) {
      MIRBuilder.buildFPTOUI(Dst, Min);
      MI.eraseFromParent();
      return Legalized;
    }

    // Otherwise, select 0 if Src is NaN.
    auto FpToInt = MIRBuilder.buildFPTOSI(DstTy, Min);
    // FCMP_UNO of a value against itself is only true for NaN.
    auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
                                       DstTy.changeElementSize(1), Src, Src);
    MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0),
                           FpToInt);
    MI.eraseFromParent();
    return Legalized;
  }

  // Result of direct conversion. The assumption here is that the operation is
  // non-trapping and it's fine to apply it to an out-of-range value if we
  // select it away later.
  auto FpToInt = IsSigned ? MIRBuilder.buildFPTOSI(DstTy, Src)
                          : MIRBuilder.buildFPTOUI(DstTy, Src);

  // If Src ULT MinFloat, select MinInt. In particular, this also selects
  // MinInt if Src is NaN.
  auto ULT =
      MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, SrcTy.changeElementSize(1), Src,
                           MIRBuilder.buildFConstant(SrcTy, MinFloat));
  auto Max = MIRBuilder.buildSelect(
      DstTy, ULT, MIRBuilder.buildConstant(DstTy, MinInt), FpToInt);
  // If Src OGT MaxFloat, select MaxInt.
  auto OGT =
      MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, SrcTy.changeElementSize(1), Src,
                           MIRBuilder.buildFConstant(SrcTy, MaxFloat));

  // In the unsigned case we are done, because we mapped NaN to MinInt, which
  // is already zero.
  if (!IsSigned) {
    MIRBuilder.buildSelect(Dst, OGT, MIRBuilder.buildConstant(DstTy, MaxInt),
                           Max);
    MI.eraseFromParent();
    return Legalized;
  }

  // Otherwise, select 0 if Src is NaN.
  auto Min = MIRBuilder.buildSelect(
      DstTy, OGT, MIRBuilder.buildConstant(DstTy, MaxInt), Max);
  auto IsZero = MIRBuilder.buildFCmp(CmpInst::FCMP_UNO,
                                     DstTy.changeElementSize(1), Src, Src);
  MIRBuilder.buildSelect(Dst, IsZero, MIRBuilder.buildConstant(DstTy, 0), Min);
  MI.eraseFromParent();
  return Legalized;
}
7993 
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
// Operates on the raw f64 bit pattern: the value is unmerged into 32-bit
// halves and the f16 sign/exponent/mantissa are assembled with integer ops.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  auto [Dst, Src] = MI.getFirst2Regs();
  assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
         MRI.getType(Src).getScalarType() == LLT::scalar(64));

  if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
    return UnableToLegalize;

  // With unsafe-fp-math the double rounding of f64->f32->f16 is acceptable.
  if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
    unsigned Flags = MI.getFlags();
    auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
    MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
    MI.eraseFromParent();
    return Legalized;
  }

  const unsigned ExpMask = 0x7ff;
  const unsigned ExpBiasf64 = 1023;
  const unsigned ExpBiasf16 = 15;

  // Split the f64 bits into 32-bit halves: U = result 0 (low bits),
  // UH = result 1 (high bits, containing sign and exponent).
  auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
  Register U = Unmerge.getReg(0);
  Register UH = Unmerge.getReg(1);

  // The f64 exponent field starts at bit 52, i.e. bit 20 of the high half.
  auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
  E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));

  // Subtract the fp64 exponent bias (1023) to get the real exponent and
  // add the f16 bias (15) to get the biased exponent for the f16 format.
  E = MIRBuilder.buildAdd(
    S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));

  auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
  M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));

  // Collect every mantissa bit discarded above into a sticky bit so that
  // rounding can still see them.
  auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
                                       MIRBuilder.buildConstant(S32, 0x1ff));
  MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);

  auto Zero = MIRBuilder.buildConstant(S32, 0);
  auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
  auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
  M = MIRBuilder.buildOr(S32, M, Lo40Set);

  // (M != 0 ? 0x0200 : 0) | 0x7c00;
  // I is the Inf/NaN payload: exponent all ones, plus a quiet bit if the
  // mantissa was nonzero (NaN rather than Inf).
  auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
  auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
  auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);

  auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
  auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);

  // N = M | (E << 12);
  auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
  auto N = MIRBuilder.buildOr(S32, M, EShl12);

  // B = clamp(1-E, 0, 13);
  // Denormal path: B is how far the significand must shift right.
  auto One = MIRBuilder.buildConstant(S32, 1);
  auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
  auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
  B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));

  auto SigSetHigh = MIRBuilder.buildOr(S32, M,
                                       MIRBuilder.buildConstant(S32, 0x1000));

  auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
  auto D0 = MIRBuilder.buildShl(S32, D, B);

  // If shifting lost bits, set a sticky bit in the denormal significand.
  auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
                                             D0, SigSetHigh);
  auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
  D = MIRBuilder.buildOr(S32, D, D1);

  // E < 1 selects the denormal encoding D, otherwise the normal encoding N.
  auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
  auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);

  // Round to nearest, ties to even, using the low 3 (guard/round/sticky) bits.
  auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
  V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));

  auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 3));
  auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);

  auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
                                       MIRBuilder.buildConstant(S32, 5));
  auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);

  V1 = MIRBuilder.buildOr(S32, V0, V1);
  V = MIRBuilder.buildAdd(S32, V, V1);

  // Overflowed exponents (E > 30) become infinity (0x7c00).
  auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
                                       E, MIRBuilder.buildConstant(S32, 30));
  V = MIRBuilder.buildSelect(S32, CmpEGt30,
                             MIRBuilder.buildConstant(S32, 0x7c00), V);

  // E == 1039 is an all-ones f64 exponent (0x7ff - 1023 + 15), i.e. the input
  // was Inf or NaN; use the precomputed Inf/NaN encoding I.
  auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
                                         E, MIRBuilder.buildConstant(S32, 1039));
  V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);

  // Extract the sign bit.
  auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
  Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));

  // Insert the sign bit
  V = MIRBuilder.buildOr(S32, Sign, V);

  MIRBuilder.buildTrunc(Dst, V);
  MI.eraseFromParent();
  return Legalized;
}
8109 
8110 LegalizerHelper::LegalizeResult
8111 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
8112   auto [DstTy, SrcTy] = MI.getFirst2LLTs();
8113   const LLT S64 = LLT::scalar(64);
8114   const LLT S16 = LLT::scalar(16);
8115 
8116   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
8117     return lowerFPTRUNC_F64_TO_F16(MI);
8118 
8119   return UnableToLegalize;
8120 }
8121 
8122 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
8123   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8124   LLT Ty = MRI.getType(Dst);
8125 
8126   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
8127   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
8128   MI.eraseFromParent();
8129   return Legalized;
8130 }
8131 
8132 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
8133   switch (Opc) {
8134   case TargetOpcode::G_SMIN:
8135     return CmpInst::ICMP_SLT;
8136   case TargetOpcode::G_SMAX:
8137     return CmpInst::ICMP_SGT;
8138   case TargetOpcode::G_UMIN:
8139     return CmpInst::ICMP_ULT;
8140   case TargetOpcode::G_UMAX:
8141     return CmpInst::ICMP_UGT;
8142   default:
8143     llvm_unreachable("not in integer min/max");
8144   }
8145 }
8146 
8147 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
8148   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8149 
8150   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
8151   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
8152 
8153   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
8154   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
8155 
8156   MI.eraseFromParent();
8157   return Legalized;
8158 }
8159 
// Lower G_SCMP/G_UCMP (three-way compare): Dst = -1 if LHS < RHS, 0 if equal,
// +1 if LHS > RHS, using either selects or a subtraction of extended booleans
// depending on what the target prefers.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) {
  GSUCmp *Cmp = cast<GSUCmp>(&MI);

  Register Dst = Cmp->getReg(0);
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Cmp->getReg(1));
  LLT CmpTy = DstTy.changeElementSize(1);

  // Pick signed or unsigned orderings to match the opcode.
  CmpInst::Predicate LTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SLT
                                       : CmpInst::Predicate::ICMP_ULT;
  CmpInst::Predicate GTPredicate = Cmp->isSigned()
                                       ? CmpInst::Predicate::ICMP_SGT
                                       : CmpInst::Predicate::ICMP_UGT;

  auto Zero = MIRBuilder.buildConstant(DstTy, 0);
  auto IsGT = MIRBuilder.buildICmp(GTPredicate, CmpTy, Cmp->getLHSReg(),
                                   Cmp->getRHSReg());
  auto IsLT = MIRBuilder.buildICmp(LTPredicate, CmpTy, Cmp->getLHSReg(),
                                   Cmp->getRHSReg());

  auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
  auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false);
  if (TLI.shouldExpandCmpUsingSelects(getApproximateEVTForLLT(SrcTy, Ctx)) ||
      BC == TargetLowering::UndefinedBooleanContent) {
    // Select-based expansion: IsLT ? -1 : (IsGT ? 1 : 0).
    auto One = MIRBuilder.buildConstant(DstTy, 1);
    auto SelectZeroOrOne = MIRBuilder.buildSelect(DstTy, IsGT, One, Zero);

    auto MinusOne = MIRBuilder.buildConstant(DstTy, -1);
    MIRBuilder.buildSelect(Dst, IsLT, MinusOne, SelectZeroOrOne);
  } else {
    // Subtraction-based expansion: ext(IsGT) - ext(IsLT). With 0/-1 booleans
    // the extended values have inverted sign, so swap the operands to keep
    // GT -> +1 and LT -> -1.
    if (BC == TargetLowering::ZeroOrNegativeOneBooleanContent)
      std::swap(IsGT, IsLT);
    // Extend boolean results to DstTy, which is at least i2, before subtracting
    // them.
    unsigned BoolExtOp =
        MIRBuilder.getBoolExtOp(DstTy.isVector(), /*isFP=*/false);
    IsGT = MIRBuilder.buildInstr(BoolExtOp, {DstTy}, {IsGT});
    IsLT = MIRBuilder.buildInstr(BoolExtOp, {DstTy}, {IsLT});
    MIRBuilder.buildSub(Dst, IsGT, IsLT);
  }

  MI.eraseFromParent();
  return Legalized;
}
8206 
// Lower G_FCOPYSIGN with integer masking: combine the magnitude bits of Src0
// with the sign bit of Src1, handling mismatched operand widths.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
  auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
  const int Src0Size = Src0Ty.getScalarSizeInBits();
  const int Src1Size = Src1Ty.getScalarSizeInBits();

  // Mask selecting only the sign bit of a Src0-sized value.
  auto SignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getSignMask(Src0Size));

  // Mask selecting everything except the sign bit.
  auto NotSignBitMask = MIRBuilder.buildConstant(
    Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));

  // Magnitude of Src0 with its sign bit cleared.
  Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
  Register And1;
  if (Src0Ty == Src1Ty) {
    And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
  } else if (Src0Size > Src1Size) {
    // Src1 is narrower: widen and shift its sign bit up into Src0's sign
    // position before masking.
    auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
    auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
    auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
    And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
  } else {
    // Src1 is wider: shift its sign bit down and truncate to Src0's width
    // before masking.
    auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
    auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
    auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
    And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
  }

  // Be careful about setting nsz/nnan/ninf on every instruction, since the
  // constants are a nan and -0.0, but the final result should preserve
  // everything.
  unsigned Flags = MI.getFlags();

  // We masked the sign bit and the not-sign bit, so these are disjoint.
  Flags |= MachineInstr::Disjoint;

  MIRBuilder.buildOr(Dst, And0, And1, Flags);

  MI.eraseFromParent();
  return Legalized;
}
8248 
8249 LegalizerHelper::LegalizeResult
8250 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
8251   // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
8252   // identical handling. fminimumnum/fmaximumnum also need a path that do not
8253   // depend on fminnum/fmaxnum.
8254 
8255   unsigned NewOp;
8256   switch (MI.getOpcode()) {
8257   case TargetOpcode::G_FMINNUM:
8258     NewOp = TargetOpcode::G_FMINNUM_IEEE;
8259     break;
8260   case TargetOpcode::G_FMINIMUMNUM:
8261     NewOp = TargetOpcode::G_FMINNUM;
8262     break;
8263   case TargetOpcode::G_FMAXNUM:
8264     NewOp = TargetOpcode::G_FMAXNUM_IEEE;
8265     break;
8266   case TargetOpcode::G_FMAXIMUMNUM:
8267     NewOp = TargetOpcode::G_FMAXNUM;
8268     break;
8269   default:
8270     llvm_unreachable("unexpected min/max opcode");
8271   }
8272 
8273   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
8274   LLT Ty = MRI.getType(Dst);
8275 
8276   if (!MI.getFlag(MachineInstr::FmNoNans)) {
8277     // Insert canonicalizes if it's possible we need to quiet to get correct
8278     // sNaN behavior.
8279 
8280     // Note this must be done here, and not as an optimization combine in the
8281     // absence of a dedicate quiet-snan instruction as we're using an
8282     // omni-purpose G_FCANONICALIZE.
8283     if (!isKnownNeverSNaN(Src0, MRI))
8284       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
8285 
8286     if (!isKnownNeverSNaN(Src1, MRI))
8287       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
8288   }
8289 
8290   // If there are no nans, it's safe to simply replace this with the non-IEEE
8291   // version.
8292   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
8293   MI.eraseFromParent();
8294   return Legalized;
8295 }
8296 
8297 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
8298   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
8299   Register DstReg = MI.getOperand(0).getReg();
8300   LLT Ty = MRI.getType(DstReg);
8301   unsigned Flags = MI.getFlags();
8302 
8303   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
8304                                   Flags);
8305   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
8306   MI.eraseFromParent();
8307   return Legalized;
8308 }
8309 
8310 LegalizerHelper::LegalizeResult
8311 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
8312   auto [DstReg, X] = MI.getFirst2Regs();
8313   const unsigned Flags = MI.getFlags();
8314   const LLT Ty = MRI.getType(DstReg);
8315   const LLT CondTy = Ty.changeElementSize(1);
8316 
8317   // round(x) =>
8318   //  t = trunc(x);
8319   //  d = fabs(x - t);
8320   //  o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
8321   //  return t + o;
8322 
8323   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
8324 
8325   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
8326   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
8327 
8328   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
8329   auto Cmp =
8330       MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);
8331 
8332   // Could emit G_UITOFP instead
8333   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
8334   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
8335   auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
8336   auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);
8337 
8338   MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);
8339 
8340   MI.eraseFromParent();
8341   return Legalized;
8342 }
8343 
8344 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
8345   auto [DstReg, SrcReg] = MI.getFirst2Regs();
8346   unsigned Flags = MI.getFlags();
8347   LLT Ty = MRI.getType(DstReg);
8348   const LLT CondTy = Ty.changeElementSize(1);
8349 
8350   // result = trunc(src);
8351   // if (src < 0.0 && src != result)
8352   //   result += -1.0.
8353 
8354   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
8355   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
8356 
8357   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
8358                                   SrcReg, Zero, Flags);
8359   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
8360                                       SrcReg, Trunc, Flags);
8361   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
8362   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
8363 
8364   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
8365   MI.eraseFromParent();
8366   return Legalized;
8367 }
8368 
// Lower G_MERGE_VALUES by zero-extending each part to the full destination
// width, shifting it into position, and OR-ing it into an accumulator.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
  const unsigned NumOps = MI.getNumOperands();
  auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
  unsigned PartSize = Src0Ty.getSizeInBits();

  // Accumulate in a scalar of the destination's total width; part 0 needs no
  // shift.
  LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
  Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);

  // Operand 0 is the def; source parts start at operand 1 (already consumed
  // above), so the loop runs from operand 2.
  for (unsigned I = 2; I != NumOps; ++I) {
    const unsigned Offset = (I - 1) * PartSize;

    Register SrcReg = MI.getOperand(I).getReg();
    auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);

    // The last OR can write DstReg directly when no final cast is needed.
    Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
      MRI.createGenericVirtualRegister(WideTy);

    auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
    auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
    MIRBuilder.buildOr(NextResult, ResultReg, Shl);
    ResultReg = NextResult;
  }

  if (DstTy.isPointer()) {
    // Pointer results need an int-to-ptr cast, which is invalid for
    // non-integral address spaces.
    if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
          DstTy.getAddressSpace())) {
      LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
      return UnableToLegalize;
    }

    MIRBuilder.buildIntToPtr(DstReg, ResultReg);
  }

  MI.eraseFromParent();
  return Legalized;
}
8406 
// Lower G_UNMERGE_VALUES of a scalar(-coercible) source: each destination
// piece I becomes trunc(Src >> (I * DstSize)).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
  // The source is the last operand; all earlier operands are defs.
  const unsigned NumDst = MI.getNumOperands() - 1;
  Register SrcReg = MI.getOperand(NumDst).getReg();
  Register Dst0Reg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst0Reg);
  if (DstTy.isPointer())
    return UnableToLegalize; // TODO

  // Coerce the source into a single scalar register; bail out if that isn't
  // possible for this type.
  SrcReg = coerceToScalar(SrcReg);
  if (!SrcReg)
    return UnableToLegalize;

  // Expand scalarizing unmerge as bitcast to integer and shift.
  LLT IntTy = MRI.getType(SrcReg);

  // Piece 0 is just the low bits of the source.
  MIRBuilder.buildTrunc(Dst0Reg, SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  unsigned Offset = DstSize;
  for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
    auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
    MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
  }

  MI.eraseFromParent();
  return Legalized;
}
8436 
8437 /// Lower a vector extract or insert by writing the vector to a stack temporary
8438 /// and reloading the element or vector.
8439 ///
8440 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
8441 ///  =>
8442 ///  %stack_temp = G_FRAME_INDEX
8443 ///  G_STORE %vec, %stack_temp
8444 ///  %idx = clamp(%idx, %vec.getNumElements())
8445 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
8446 ///  %dst = G_LOAD %element_ptr
8447 LegalizerHelper::LegalizeResult
8448 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
8449   Register DstReg = MI.getOperand(0).getReg();
8450   Register SrcVec = MI.getOperand(1).getReg();
8451   Register InsertVal;
8452   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
8453     InsertVal = MI.getOperand(2).getReg();
8454 
8455   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
8456 
8457   LLT VecTy = MRI.getType(SrcVec);
8458   LLT EltTy = VecTy.getElementType();
8459   unsigned NumElts = VecTy.getNumElements();
8460 
8461   int64_t IdxVal;
8462   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
8463     SmallVector<Register, 8> SrcRegs;
8464     extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);
8465 
8466     if (InsertVal) {
8467       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
8468       MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
8469     } else {
8470       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
8471     }
8472 
8473     MI.eraseFromParent();
8474     return Legalized;
8475   }
8476 
8477   if (!EltTy.isByteSized()) { // Not implemented.
8478     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
8479     return UnableToLegalize;
8480   }
8481 
8482   unsigned EltBytes = EltTy.getSizeInBytes();
8483   Align VecAlign = getStackTemporaryAlignment(VecTy);
8484   Align EltAlign;
8485 
8486   MachinePointerInfo PtrInfo;
8487   auto StackTemp = createStackTemporary(
8488       TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
8489   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
8490 
8491   // Get the pointer to the element, and be sure not to hit undefined behavior
8492   // if the index is out of bounds.
8493   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
8494 
8495   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
8496     int64_t Offset = IdxVal * EltBytes;
8497     PtrInfo = PtrInfo.getWithOffset(Offset);
8498     EltAlign = commonAlignment(VecAlign, Offset);
8499   } else {
8500     // We lose information with a variable offset.
8501     EltAlign = getStackTemporaryAlignment(EltTy);
8502     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
8503   }
8504 
8505   if (InsertVal) {
8506     // Write the inserted element
8507     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
8508 
8509     // Reload the whole vector.
8510     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
8511   } else {
8512     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
8513   }
8514 
8515   MI.eraseFromParent();
8516   return Legalized;
8517 }
8518 
8519 LegalizerHelper::LegalizeResult
8520 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
8521   auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
8522       MI.getFirst3RegLLTs();
8523   LLT IdxTy = LLT::scalar(32);
8524 
8525   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
8526   Register Undef;
8527   SmallVector<Register, 32> BuildVec;
8528   LLT EltTy = DstTy.getScalarType();
8529 
8530   for (int Idx : Mask) {
8531     if (Idx < 0) {
8532       if (!Undef.isValid())
8533         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
8534       BuildVec.push_back(Undef);
8535       continue;
8536     }
8537 
8538     if (Src0Ty.isScalar()) {
8539       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
8540     } else {
8541       int NumElts = Src0Ty.getNumElements();
8542       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
8543       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
8544       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
8545       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
8546       BuildVec.push_back(Extract.getReg(0));
8547     }
8548   }
8549 
8550   if (DstTy.isVector())
8551     MIRBuilder.buildBuildVector(DstReg, BuildVec);
8552   else
8553     MIRBuilder.buildCopy(DstReg, BuildVec[0]);
8554   MI.eraseFromParent();
8555   return Legalized;
8556 }
8557 
/// Lower G_VECTOR_COMPRESS by materializing the result element-by-element
/// through a stack temporary: each selected lane of \p Vec is stored at a
/// running output position that only advances for selected lanes, then the
/// whole vector is reloaded. When a passthru vector is present it pre-fills
/// the temporary so unwritten tail lanes keep passthru values.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) {
  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
      MI.getFirst4RegLLTs();

  if (VecTy.isScalableVector())
    report_fatal_error("Cannot expand masked_compress for scalable vectors.");

  // Stack slot large enough for the whole vector; elements are addressed
  // individually through getVectorElementPointer below.
  Align VecAlign = getStackTemporaryAlignment(VecTy);
  MachinePointerInfo PtrInfo;
  Register StackPtr =
      createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign,
                           PtrInfo)
          .getReg(0);
  MachinePointerInfo ValPtrInfo =
      MachinePointerInfo::getUnknownStack(*MI.getMF());

  LLT IdxTy = LLT::scalar(32);
  LLT ValTy = VecTy.getElementType();
  Align ValAlign = getStackTemporaryAlignment(ValTy);

  // Running output position: incremented by 1 for each set mask lane.
  auto OutPos = MIRBuilder.buildConstant(IdxTy, 0);

  // An undef passthru means unselected tail lanes may hold anything, so the
  // pre-fill store and the last-element fixup below can be skipped.
  bool HasPassthru =
      MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF;

  if (HasPassthru)
    MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign);

  // LastWriteVal is the value that must end up in the final slot when every
  // lane is selected (the compressed data would otherwise overflow the
  // vector). For a splat passthru it is just the splat constant; otherwise it
  // is read back from the slot at popcount(Mask).
  Register LastWriteVal;
  std::optional<APInt> PassthruSplatVal =
      isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI);

  if (PassthruSplatVal.has_value()) {
    LastWriteVal =
        MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0);
  } else if (HasPassthru) {
    auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask);
    Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD,
                                     {LLT::scalar(32)}, {Popcount});

    Register LastElmtPtr =
        getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0));
    LastWriteVal =
        MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign)
            .getReg(0);
  }

  unsigned NumElmts = VecTy.getNumElements();
  for (unsigned I = 0; I < NumElmts; ++I) {
    auto Idx = MIRBuilder.buildConstant(IdxTy, I);
    auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx);
    // Store unconditionally; an unselected lane is overwritten by the next
    // selected one because OutPos does not advance for it.
    Register ElmtPtr =
        getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));
    MIRBuilder.buildStore(Val, ElmtPtr, ValPtrInfo, ValAlign);

    // Normalize the mask lane to an i1, then add it (as 0/1) to OutPos.
    LLT MaskITy = MaskTy.getElementType();
    auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx);
    if (MaskITy.getSizeInBits() > 1)
      MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI);

    MaskI = MIRBuilder.buildZExt(IdxTy, MaskI);
    OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI);

    if (HasPassthru && I == NumElmts - 1) {
      // After the last lane, clamp OutPos into the vector and write
      // LastWriteVal into the final slot — unless all lanes were selected,
      // in which case the last extracted value belongs there instead.
      auto EndOfVector =
          MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1);
      auto AllLanesSelected = MIRBuilder.buildICmp(
          CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector);
      OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy},
                                     {OutPos, EndOfVector});
      ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0));

      LastWriteVal =
          MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal)
              .getReg(0);
      MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign);
    }
  }

  // TODO: Use StackPtr's FrameIndex alignment.
  MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign);

  MI.eraseFromParent();
  return Legalized;
}
8644 
8645 Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
8646                                                     Register AllocSize,
8647                                                     Align Alignment,
8648                                                     LLT PtrTy) {
8649   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
8650 
8651   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
8652   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
8653 
8654   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
8655   // have to generate an extra instruction to negate the alloc and then use
8656   // G_PTR_ADD to add the negative offset.
8657   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
8658   if (Alignment > Align(1)) {
8659     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
8660     AlignMask.negate();
8661     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
8662     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
8663   }
8664 
8665   return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
8666 }
8667 
8668 LegalizerHelper::LegalizeResult
8669 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
8670   const auto &MF = *MI.getMF();
8671   const auto &TFI = *MF.getSubtarget().getFrameLowering();
8672   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
8673     return UnableToLegalize;
8674 
8675   Register Dst = MI.getOperand(0).getReg();
8676   Register AllocSize = MI.getOperand(1).getReg();
8677   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
8678 
8679   LLT PtrTy = MRI.getType(Dst);
8680   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
8681   Register SPTmp =
8682       getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
8683 
8684   MIRBuilder.buildCopy(SPReg, SPTmp);
8685   MIRBuilder.buildCopy(Dst, SPTmp);
8686 
8687   MI.eraseFromParent();
8688   return Legalized;
8689 }
8690 
8691 LegalizerHelper::LegalizeResult
8692 LegalizerHelper::lowerStackSave(MachineInstr &MI) {
8693   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
8694   if (!StackPtr)
8695     return UnableToLegalize;
8696 
8697   MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
8698   MI.eraseFromParent();
8699   return Legalized;
8700 }
8701 
8702 LegalizerHelper::LegalizeResult
8703 LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
8704   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
8705   if (!StackPtr)
8706     return UnableToLegalize;
8707 
8708   MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
8709   MI.eraseFromParent();
8710   return Legalized;
8711 }
8712 
/// Lower G_EXTRACT. Two strategies, tried in order:
///   1. Vector source with an element-aligned offset/size: unmerge the source
///      and re-merge the covered elements.
///   2. Scalar result (from a scalar, or a single element of a vector):
///      bitcast to integer if needed, shift right by the bit offset, truncate.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerExtract(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  unsigned Offset = MI.getOperand(2).getImm(); // Bit offset into the source.

  // Extract sub-vector or one element
  if (SrcTy.isVector()) {
    unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
    unsigned DstSize = DstTy.getSizeInBits();

    // Only applicable when the extracted range covers whole elements.
    if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
        (Offset + DstSize <= SrcTy.getSizeInBits())) {
      // Unmerge and allow access to each Src element for the artifact combiner.
      auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);

      // Take element(s) we need to extract and copy it (merge them).
      SmallVector<Register, 8> SubVectorElts;
      for (unsigned Idx = Offset / SrcEltSize;
           Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
        SubVectorElts.push_back(Unmerge.getReg(Idx));
      }
      if (SubVectorElts.size() == 1)
        MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
      else
        MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);

      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (DstTy.isScalar() &&
      (SrcTy.isScalar() ||
       (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
    // View the source as one wide integer so the extract becomes shift+trunc.
    LLT SrcIntTy = SrcTy;
    if (!SrcTy.isScalar()) {
      SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
      SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
    }

    if (Offset == 0)
      MIRBuilder.buildTrunc(DstReg, SrcReg);
    else {
      auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
      auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
      MIRBuilder.buildTrunc(DstReg, Shr);
    }

    MI.eraseFromParent();
    return Legalized;
  }

  return UnableToLegalize;
}
8767 
/// Lower G_INSERT. For vector destinations with element-aligned inserts,
/// rebuild the vector from unmerged elements with the inserted ones spliced
/// in. Otherwise treat the destination as one wide integer and combine via
/// mask/shift/or.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
  auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
  uint64_t Offset = MI.getOperand(3).getImm(); // Bit offset of the insert.

  LLT DstTy = MRI.getType(Src);
  LLT InsertTy = MRI.getType(InsertSrc);

  // Insert sub-vector or one element
  if (DstTy.isVector() && !InsertTy.isPointer()) {
    LLT EltTy = DstTy.getElementType();
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned InsertSize = InsertTy.getSizeInBits();

    // Only applicable when the inserted range covers whole elements.
    if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
        (Offset + InsertSize <= DstTy.getSizeInBits())) {
      auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
      SmallVector<Register, 8> DstElts;
      unsigned Idx = 0;
      // Elements from Src before insert start Offset
      for (; Idx < Offset / EltSize; ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      // Replace elements in Src with elements from InsertSrc
      if (InsertTy.getSizeInBits() > EltSize) {
        // Multi-element insert: unmerge it and splice its elements in.
        auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
        for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
             ++Idx, ++i) {
          DstElts.push_back(UnmergeInsertSrc.getReg(i));
        }
      } else {
        // Single-element insert: use the register directly.
        DstElts.push_back(InsertSrc);
        ++Idx;
      }

      // Remaining elements from Src after insert
      for (; Idx < DstTy.getNumElements(); ++Idx) {
        DstElts.push_back(UnmergeSrc.getReg(Idx));
      }

      MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
      MI.eraseFromParent();
      return Legalized;
    }
  }

  if (InsertTy.isVector() ||
      (DstTy.isVector() && DstTy.getElementType() != InsertTy))
    return UnableToLegalize;

  // Pointers in non-integral address spaces cannot be round-tripped through
  // integers, so the mask/shift/or strategy below would be invalid.
  const DataLayout &DL = MIRBuilder.getDataLayout();
  if ((DstTy.isPointer() &&
       DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
      (InsertTy.isPointer() &&
       DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
    LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
    return UnableToLegalize;
  }

  LLT IntDstTy = DstTy;

  // View both the destination and the inserted value as plain integers.
  if (!DstTy.isScalar()) {
    IntDstTy = LLT::scalar(DstTy.getSizeInBits());
    Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
  }

  if (!InsertTy.isScalar()) {
    const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
    InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
  }

  // Widen the insert to destination width and move it to its bit position.
  Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
  if (Offset != 0) {
    auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
    ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
  }

  // Mask clears the bits being replaced (set everywhere except the insert
  // range [Offset, Offset + InsertSize)).
  APInt MaskVal = APInt::getBitsSetWithWrap(
      DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);

  auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
  auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
  auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);

  MIRBuilder.buildCast(Dst, Or);
  MI.eraseFromParent();
  return Legalized;
}
8856 
8857 LegalizerHelper::LegalizeResult
8858 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
8859   auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
8860       MI.getFirst4RegLLTs();
8861   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
8862 
8863   LLT Ty = Dst0Ty;
8864   LLT BoolTy = Dst1Ty;
8865 
8866   Register NewDst0 = MRI.cloneVirtualRegister(Dst0);
8867 
8868   if (IsAdd)
8869     MIRBuilder.buildAdd(NewDst0, LHS, RHS);
8870   else
8871     MIRBuilder.buildSub(NewDst0, LHS, RHS);
8872 
8873   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
8874 
8875   auto Zero = MIRBuilder.buildConstant(Ty, 0);
8876 
8877   // For an addition, the result should be less than one of the operands (LHS)
8878   // if and only if the other operand (RHS) is negative, otherwise there will
8879   // be overflow.
8880   // For a subtraction, the result should be less than one of the operands
8881   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
8882   // otherwise there will be overflow.
8883   auto ResultLowerThanLHS =
8884       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, NewDst0, LHS);
8885   auto ConditionRHS = MIRBuilder.buildICmp(
8886       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
8887 
8888   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
8889 
8890   MIRBuilder.buildCopy(Dst0, NewDst0);
8891   MI.eraseFromParent();
8892 
8893   return Legalized;
8894 }
8895 
/// Lower G_[SU]{ADD,SUB}SAT using min/max operations: clamp the RHS into the
/// range that cannot overflow for the given LHS, then perform a plain
/// add/sub.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  bool IsSigned;
  bool IsAdd;
  unsigned BaseOp; // Non-saturating opcode performing the actual arithmetic.
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    BaseOp = TargetOpcode::G_ADD;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    BaseOp = TargetOpcode::G_SUB;
    break;
  }

  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   hi = 0x7fffffff - smax(a, 0)
    //   lo = 0x80000000 - smin(a, 0)
    //   a + smin(smax(lo, b), hi)
    // ssub.sat(a, b) ->
    //   lo = smax(a, -1) - 0x7fffffff
    //   hi = smin(a, -1) - 0x80000000
    //   a - smin(smax(lo, b), hi)
    // TODO: AMDGPU can use a "median of 3" instruction here:
    //   a +/- med3(lo, b, hi)
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto MaxVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    MachineInstrBuilder Hi, Lo;
    if (IsAdd) {
      auto Zero = MIRBuilder.buildConstant(Ty, 0);
      Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
      Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
    } else {
      auto NegOne = MIRBuilder.buildConstant(Ty, -1);
      Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
                               MaxVal);
      Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
                               MinVal);
    }
    // Clamp RHS into [Lo, Hi] so the base op cannot overflow.
    auto RHSClamped =
        MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
  } else {
    // uadd.sat(a, b) -> a + umin(~a, b)
    // usub.sat(a, b) -> a - umin(a, b)
    Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
    auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
    MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
  }

  MI.eraseFromParent();
  return Legalized;
}
8970 
/// Lower G_[SU]{ADD,SUB}SAT using the corresponding overflow-reporting
/// operation (G_[SU]{ADD,SUB}O): do the raw arithmetic, then select the
/// appropriate saturation constant when the overflow flag is set.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
  auto [Res, LHS, RHS] = MI.getFirst3Regs();
  LLT Ty = MRI.getType(Res);
  LLT BoolTy = Ty.changeElementSize(1);
  bool IsSigned;
  bool IsAdd;
  unsigned OverflowOp; // Overflow-reporting opcode matching this saturation.
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected addsat/subsat opcode");
  case TargetOpcode::G_UADDSAT:
    IsSigned = false;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_UADDO;
    break;
  case TargetOpcode::G_SADDSAT:
    IsSigned = true;
    IsAdd = true;
    OverflowOp = TargetOpcode::G_SADDO;
    break;
  case TargetOpcode::G_USUBSAT:
    IsSigned = false;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_USUBO;
    break;
  case TargetOpcode::G_SSUBSAT:
    IsSigned = true;
    IsAdd = false;
    OverflowOp = TargetOpcode::G_SSUBO;
    break;
  }

  auto OverflowRes =
      MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
  Register Tmp = OverflowRes.getReg(0); // Raw (wrapped) result.
  Register Ov = OverflowRes.getReg(1);  // Overflow flag.
  MachineInstrBuilder Clamp;
  if (IsSigned) {
    // sadd.sat(a, b) ->
    //   {tmp, ov} = saddo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // ssub.sat(a, b) ->
    //   {tmp, ov} = ssubo(a, b)
    //   ov ? (tmp >>s 31) + 0x80000000 : r
    // The sign of the wrapped result tells which direction it overflowed, so
    // the clamp becomes INT_MAX for positive overflow and INT_MIN otherwise.
    uint64_t NumBits = Ty.getScalarSizeInBits();
    auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
    auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
    auto MinVal =
        MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
    Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
  } else {
    // uadd.sat(a, b) ->
    //   {tmp, ov} = uaddo(a, b)
    //   ov ? 0xffffffff : tmp
    // usub.sat(a, b) ->
    //   {tmp, ov} = usubo(a, b)
    //   ov ? 0 : tmp
    Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
  }
  MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);

  MI.eraseFromParent();
  return Legalized;
}
9036 
9037 LegalizerHelper::LegalizeResult
9038 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
9039   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
9040           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
9041          "Expected shlsat opcode!");
9042   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
9043   auto [Res, LHS, RHS] = MI.getFirst3Regs();
9044   LLT Ty = MRI.getType(Res);
9045   LLT BoolTy = Ty.changeElementSize(1);
9046 
9047   unsigned BW = Ty.getScalarSizeInBits();
9048   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
9049   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
9050                        : MIRBuilder.buildLShr(Ty, Result, RHS);
9051 
9052   MachineInstrBuilder SatVal;
9053   if (IsSigned) {
9054     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
9055     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
9056     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
9057                                     MIRBuilder.buildConstant(Ty, 0));
9058     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
9059   } else {
9060     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
9061   }
9062   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
9063   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
9064 
9065   MI.eraseFromParent();
9066   return Legalized;
9067 }
9068 
9069 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
9070   auto [Dst, Src] = MI.getFirst2Regs();
9071   const LLT Ty = MRI.getType(Src);
9072   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
9073   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
9074 
9075   // Swap most and least significant byte, set remaining bytes in Res to zero.
9076   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
9077   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
9078   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
9079   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
9080 
9081   // Set i-th high/low byte in Res to i-th low/high byte from Src.
9082   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
9083     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
9084     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
9085     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
9086     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
9087     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
9088     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
9089     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
9090     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
9091     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
9092     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
9093     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
9094     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
9095   }
9096   Res.getInstr()->getOperand(0).setReg(Dst);
9097 
9098   MI.eraseFromParent();
9099   return Legalized;
9100 }
9101 
9102 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
9103 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
9104                                  MachineInstrBuilder Src, const APInt &Mask) {
9105   const LLT Ty = Dst.getLLTTy(*B.getMRI());
9106   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
9107   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
9108   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
9109   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
9110   return B.buildOr(Dst, LHS, RHS);
9111 }
9112 
/// Lower G_BITREVERSE. For types of 8 bits or more, byte-swap first and then
/// reverse the bits inside each byte with three mask-and-shift stages (or, if
/// an i8-vector bitreverse of the same total size is legal, bitcast and use
/// it). Sub-byte types are reversed one bit at a time.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
  auto [Dst, Src] = MI.getFirst2Regs();
  const LLT SrcTy = MRI.getType(Src);
  unsigned Size = SrcTy.getScalarSizeInBits();
  unsigned VSize = SrcTy.getSizeInBits();

  if (Size >= 8) {
    if (SrcTy.isVector() && (VSize % 8 == 0) &&
        (LI.isLegal({TargetOpcode::G_BITREVERSE,
                     {LLT::fixed_vector(VSize / 8, 8),
                      LLT::fixed_vector(VSize / 8, 8)}}))) {
      // If bitreverse is legal for i8 vector of the same size, then cast
      // to i8 vector type.
      // e.g. v4s32 -> v16s8
      LLT VTy = LLT::fixed_vector(VSize / 8, 8);
      auto BSWAP = MIRBuilder.buildBSwap(SrcTy, Src);
      auto Cast = MIRBuilder.buildBitcast(VTy, BSWAP);
      auto RBIT = MIRBuilder.buildBitReverse(VTy, Cast);
      MIRBuilder.buildBitcast(Dst, RBIT);
    } else {
      // Byte-swap puts each byte in its mirrored position; the three SwapN
      // stages below then reverse the bit order within every byte.
      MachineInstrBuilder BSWAP =
          MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {SrcTy}, {Src});

      // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
      //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
      // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
      MachineInstrBuilder Swap4 = SwapN(4, SrcTy, MIRBuilder, BSWAP,
                                        APInt::getSplat(Size, APInt(8, 0xF0)));

      // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
      //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
      // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
      MachineInstrBuilder Swap2 = SwapN(2, SrcTy, MIRBuilder, Swap4,
                                        APInt::getSplat(Size, APInt(8, 0xCC)));

      // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5
      // 6|7
      //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
      // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
      SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
    }
  } else {
    // Expand bitreverse for types smaller than 8 bits.
    // Bit I of the source is moved to bit J = Size-1-I of the result, one OR
    // at a time.
    MachineInstrBuilder Tmp;
    for (unsigned I = 0, J = Size - 1; I < Size; ++I, --J) {
      MachineInstrBuilder Tmp2;
      if (I < J) {
        auto ShAmt = MIRBuilder.buildConstant(SrcTy, J - I);
        Tmp2 = MIRBuilder.buildShl(SrcTy, Src, ShAmt);
      } else {
        auto ShAmt = MIRBuilder.buildConstant(SrcTy, I - J);
        Tmp2 = MIRBuilder.buildLShr(SrcTy, Src, ShAmt);
      }

      // Isolate the single bit now sitting at position J.
      auto Mask = MIRBuilder.buildConstant(SrcTy, 1ULL << J);
      Tmp2 = MIRBuilder.buildAnd(SrcTy, Tmp2, Mask);
      if (I == 0)
        Tmp = Tmp2;
      else
        Tmp = MIRBuilder.buildOr(SrcTy, Tmp, Tmp2);
    }
    MIRBuilder.buildCopy(Dst, Tmp);
  }

  MI.eraseFromParent();
  return Legalized;
}
9181 
/// Lower G_READ_REGISTER / G_WRITE_REGISTER to a copy from/to the named
/// physical register. Emits a diagnostic (and, for reads, an undef value)
/// when the target does not recognize the register name.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
  MachineFunction &MF = MIRBuilder.getMF();

  bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
  // G_READ_REGISTER: (def $val), !name.  G_WRITE_REGISTER: !name, $val.
  int NameOpIdx = IsRead ? 1 : 0;
  int ValRegIndex = IsRead ? 0 : 1;

  Register ValReg = MI.getOperand(ValRegIndex).getReg();
  const LLT Ty = MRI.getType(ValReg);
  // The register name is the first operand of the metadata node.
  const MDString *RegStr = cast<MDString>(
    cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));

  Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
  if (!PhysReg) {
    // Unknown register name: diagnose and, for reads, still define the dest
    // vreg (with undef) so the MIR stays valid after erasing MI.
    const Function &Fn = MF.getFunction();
    Fn.getContext().diagnose(DiagnosticInfoGenericWithLoc(
        "invalid register \"" + Twine(RegStr->getString().data()) + "\" for " +
            (IsRead ? "llvm.read_register" : "llvm.write_register"),
        Fn, MI.getDebugLoc()));
    if (IsRead)
      MIRBuilder.buildUndef(ValReg);

    MI.eraseFromParent();
    return Legalized;
  }

  if (IsRead)
    MIRBuilder.buildCopy(ValReg, PhysReg);
  else
    MIRBuilder.buildCopy(PhysReg, ValReg);

  MI.eraseFromParent();
  return Legalized;
}
9217 
9218 LegalizerHelper::LegalizeResult
9219 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
9220   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
9221   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
9222   Register Result = MI.getOperand(0).getReg();
9223   LLT OrigTy = MRI.getType(Result);
9224   auto SizeInBits = OrigTy.getScalarSizeInBits();
9225   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
9226 
9227   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
9228   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
9229   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
9230   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
9231 
9232   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
9233   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
9234   MIRBuilder.buildTrunc(Result, Shifted);
9235 
9236   MI.eraseFromParent();
9237   return Legalized;
9238 }
9239 
/// Lower G_IS_FPCLASS by reinterpreting the floating-point source as a
/// same-width integer and testing the IEEE-754 bit patterns for each class
/// requested in the immediate test mask. The partial results for each class
/// are OR'd together into the final boolean destination.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
  auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
  FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());

  // Trivial masks: testing no classes is constant-false, testing all classes
  // is constant-true.
  if (Mask == fcNone) {
    MIRBuilder.buildConstant(DstReg, 0);
    MI.eraseFromParent();
    return Legalized;
  }
  if (Mask == fcAllFlags) {
    MIRBuilder.buildConstant(DstReg, 1);
    MI.eraseFromParent();
    return Legalized;
  }

  // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
  // version

  unsigned BitSize = SrcTy.getScalarSizeInBits();
  const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());

  // View the FP value as an integer of the same total shape so the class
  // tests below become plain integer compares.
  LLT IntTy = LLT::scalar(BitSize);
  if (SrcTy.isVector())
    IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
  auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);

  // Various masks.
  APInt SignBit = APInt::getSignMask(BitSize);
  APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
  APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
  APInt ExpMask = Inf;
  APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
  APInt QNaNBitMask =
      APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
  APInt InversionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());

  auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
  auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
  auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
  auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
  auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);

  // Abs clears the sign bit; Sign is true iff clearing it changed the value,
  // i.e. the sign bit was set.
  auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
  auto Sign =
      MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);

  // Accumulator for the OR of all partial class tests, starting at false.
  auto Res = MIRBuilder.buildConstant(DstTy, 0);
  // Clang doesn't support capture of structured bindings:
  LLT DstTyCopy = DstTy;
  const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
    Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
  };

  // Tests that involve more than one class should be processed first.
  if ((Mask & fcFinite) == fcFinite) {
    // finite(V) ==> abs(V) u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                     ExpMaskC));
    Mask &= ~fcFinite;
  } else if ((Mask & fcFinite) == fcPosFinite) {
    // finite(V) && V > 0 ==> V u< exp_mask
    appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
                                     ExpMaskC));
    Mask &= ~fcPosFinite;
  } else if ((Mask & fcFinite) == fcNegFinite) {
    // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
    auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
                                    ExpMaskC);
    auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
    appendToRes(And);
    Mask &= ~fcNegFinite;
  }

  if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
    // fcZero | fcSubnormal => test all exponent bits are 0
    // TODO: Handle sign bit specific cases
    // TODO: Handle inverted case
    if (PartialCheck == (fcZero | fcSubnormal)) {
      auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       ExpBits, ZeroC));
      Mask &= ~PartialCheck;
    }
  }

  // Check for individual classes.
  if (FPClassTest PartialCheck = Mask & fcZero) {
    if (PartialCheck == fcPosZero)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, ZeroC));
    else if (PartialCheck == fcZero)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
    else // fcNegZero
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, SignBitC));
  }

  if (FPClassTest PartialCheck = Mask & fcSubnormal) {
    // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
    // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
    auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
    auto OneC = MIRBuilder.buildConstant(IntTy, 1);
    auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
    auto SubnormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
                             MIRBuilder.buildConstant(IntTy, AllOneMantissa));
    if (PartialCheck == fcNegSubnormal)
      SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
    appendToRes(SubnormalRes);
  }

  if (FPClassTest PartialCheck = Mask & fcInf) {
    if (PartialCheck == fcPosInf)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, InfC));
    else if (PartialCheck == fcInf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
    else { // fcNegInf
      APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
      auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
                                       AsInt, NegInfC));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNan) {
    auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
    if (PartialCheck == fcNan) {
      // isnan(V) ==> abs(V) u> int(inf)
      appendToRes(
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
    } else if (PartialCheck == fcQNan) {
      // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
      appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
                                       InfWithQnanBitC));
    } else { // fcSNan
      // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
      //                    abs(V) u< (unsigned(Inf) | quiet_bit)
      auto IsNan =
          MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
      auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
                                            Abs, InfWithQnanBitC);
      appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
    }
  }

  if (FPClassTest PartialCheck = Mask & fcNormal) {
    // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
    // (max_exp-1))
    APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
    auto ExpMinusOne = MIRBuilder.buildSub(
        IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
    APInt MaxExpMinusOne = ExpMask - ExpLSB;
    auto NormalRes =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
                             MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
    if (PartialCheck == fcNegNormal)
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
    else if (PartialCheck == fcPosNormal) {
      // Invert the sign test (XOR with all-ones) to require a clear sign bit.
      auto PosSign = MIRBuilder.buildXor(
          DstTy, Sign, MIRBuilder.buildConstant(DstTy, InversionMask));
      NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
    }
    appendToRes(NormalRes);
  }

  MIRBuilder.buildCopy(DstReg, Res);
  MI.eraseFromParent();
  return Legalized;
}
9413 
/// Lower G_SELECT into pure bitwise arithmetic:
///   Dst = (Mask & Op1) | (~Mask & Op2)
/// The condition is first sign-extended (and splatted for vector selects) so
/// it becomes an all-ones/all-zeros mask of the same width as the data.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
  // Implement G_SELECT in terms of XOR, AND, OR.
  auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
      MI.getFirst4RegLLTs();

  // Pointer (or pointer-vector) payloads cannot be AND/OR'd directly; convert
  // them to same-sized integers here and convert the result back at the end.
  bool IsEltPtr = DstTy.isPointerOrPointerVector();
  if (IsEltPtr) {
    LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
    LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
    Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
    Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
    DstTy = NewTy;
  }

  if (MaskTy.isScalar()) {
    // Turn the scalar condition into a vector condition mask if needed.

    Register MaskElt = MaskReg;

    // The condition was potentially zero extended before, but we want a sign
    // extended boolean.
    if (MaskTy != LLT::scalar(1))
      MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);

    // Continue the sign extension (or truncate) to match the data type.
    MaskElt =
        MIRBuilder.buildSExtOrTrunc(DstTy.getScalarType(), MaskElt).getReg(0);

    if (DstTy.isVector()) {
      // Generate a vector splat idiom.
      auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
      MaskReg = ShufSplat.getReg(0);
    } else {
      MaskReg = MaskElt;
    }
    MaskTy = DstTy;
  } else if (!DstTy.isVector()) {
    // Cannot handle the case that mask is a vector and dst is a scalar.
    return UnableToLegalize;
  }

  // The bitwise expansion only works when the mask exactly covers the data.
  if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
    return UnableToLegalize;
  }

  auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
  auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
  auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
  // Combine the two halves, restoring the pointer type if it was stripped.
  if (IsEltPtr) {
    auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
    MIRBuilder.buildIntToPtr(DstReg, Or);
  } else {
    MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
  }
  MI.eraseFromParent();
  return Legalized;
}
9471 
9472 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
9473   // Split DIVREM into individual instructions.
9474   unsigned Opcode = MI.getOpcode();
9475 
9476   MIRBuilder.buildInstr(
9477       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
9478                                         : TargetOpcode::G_UDIV,
9479       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
9480   MIRBuilder.buildInstr(
9481       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
9482                                         : TargetOpcode::G_UREM,
9483       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
9484   MI.eraseFromParent();
9485   return Legalized;
9486 }
9487 
9488 LegalizerHelper::LegalizeResult
9489 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
9490   // Expand %res = G_ABS %a into:
9491   // %v1 = G_ASHR %a, scalar_size-1
9492   // %v2 = G_ADD %a, %v1
9493   // %res = G_XOR %v2, %v1
9494   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
9495   Register OpReg = MI.getOperand(1).getReg();
9496   auto ShiftAmt =
9497       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
9498   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
9499   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
9500   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
9501   MI.eraseFromParent();
9502   return Legalized;
9503 }
9504 
9505 LegalizerHelper::LegalizeResult
9506 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
9507   // Expand %res = G_ABS %a into:
9508   // %v1 = G_CONSTANT 0
9509   // %v2 = G_SUB %v1, %a
9510   // %res = G_SMAX %a, %v2
9511   Register SrcReg = MI.getOperand(1).getReg();
9512   LLT Ty = MRI.getType(SrcReg);
9513   auto Zero = MIRBuilder.buildConstant(Ty, 0);
9514   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg);
9515   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
9516   MI.eraseFromParent();
9517   return Legalized;
9518 }
9519 
9520 LegalizerHelper::LegalizeResult
9521 LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
9522   Register SrcReg = MI.getOperand(1).getReg();
9523   Register DestReg = MI.getOperand(0).getReg();
9524   LLT Ty = MRI.getType(SrcReg), IType = LLT::scalar(1);
9525   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
9526   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
9527   auto ICmp = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, IType, SrcReg, Zero);
9528   MIRBuilder.buildSelect(DestReg, ICmp, SrcReg, Sub);
9529   MI.eraseFromParent();
9530   return Legalized;
9531 }
9532 
9533 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFAbs(MachineInstr &MI) {
9534   Register SrcReg = MI.getOperand(1).getReg();
9535   Register DstReg = MI.getOperand(0).getReg();
9536 
9537   LLT Ty = MRI.getType(DstReg);
9538 
9539   // Reset sign bit
9540   MIRBuilder.buildAnd(
9541       DstReg, SrcReg,
9542       MIRBuilder.buildConstant(
9543           Ty, APInt::getSignedMaxValue(Ty.getScalarSizeInBits())));
9544 
9545   MI.eraseFromParent();
9546   return Legalized;
9547 }
9548 
9549 LegalizerHelper::LegalizeResult
9550 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
9551   Register SrcReg = MI.getOperand(1).getReg();
9552   LLT SrcTy = MRI.getType(SrcReg);
9553   LLT DstTy = MRI.getType(SrcReg);
9554 
9555   // The source could be a scalar if the IR type was <1 x sN>.
9556   if (SrcTy.isScalar()) {
9557     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
9558       return UnableToLegalize; // FIXME: handle extension.
9559     // This can be just a plain copy.
9560     Observer.changingInstr(MI);
9561     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
9562     Observer.changedInstr(MI);
9563     return Legalized;
9564   }
9565   return UnableToLegalize;
9566 }
9567 
/// Lower G_VAARG into explicit va_list manipulation:
///   1) load the current argument pointer from the va_list head,
///   2) realign it if the argument is over-aligned,
///   3) store back the pointer bumped past this argument,
///   4) load the argument value itself from the (possibly realigned) pointer.
LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
  MachineFunction &MF = *MI.getMF();
  const DataLayout &DL = MIRBuilder.getDataLayout();
  LLVMContext &Ctx = MF.getFunction().getContext();
  Register ListPtr = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(ListPtr);

  // LstPtr is a pointer to the head of the list. Get the address
  // of the head of the list.
  Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
  MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
  auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);

  // Operand 2 carries the required argument alignment. If it exceeds the
  // minimum stack argument alignment, round VAList up: add (align - 1) and
  // mask off the low bits.
  const Align A(MI.getOperand(2).getImm());
  LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
  if (A > TLI.getMinStackArgumentAlignment()) {
    Register AlignAmt =
        MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
    auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
    auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
    VAList = AndDst.getReg(0);
  }

  // Increment the pointer, VAList, to the next vaarg
  // The list should be bumped by the size of element in the current head of
  // list.
  Register Dst = MI.getOperand(0).getReg();
  LLT LLTTy = MRI.getType(Dst);
  Type *Ty = getTypeForLLT(LLTTy, Ctx);
  auto IncAmt =
      MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty));
  auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt);

  // Store the increment VAList to the legalized pointer
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment);
  MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO);
  // Load the actual argument out of the pointer VAList
  Align EltAlignment = DL.getABITypeAlign(Ty);
  MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
      MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment);
  MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO);

  MI.eraseFromParent();
  return Legalized;
}
9615 
9616 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
9617   // On Darwin, -Os means optimize for size without hurting performance, so
9618   // only really optimize for size when -Oz (MinSize) is used.
9619   if (MF.getTarget().getTargetTriple().isOSDarwin())
9620     return MF.getFunction().hasMinSize();
9621   return MF.getFunction().hasOptSize();
9622 }
9623 
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
// Returns false when no acceptable decomposition exists within \p Limit
// operations, or when the source alignment cannot satisfy a fixed destination
// alignment.
static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
                                          unsigned Limit, const MemOp &Op,
                                          unsigned DstAS, unsigned SrcAS,
                                          const AttributeList &FuncAttributes,
                                          const TargetLowering &TLI) {
  // A memcpy whose destination alignment is fixed cannot be serviced by a
  // less-aligned source.
  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
    return false;

  // Let the target pick its preferred type first.
  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);

  if (Ty == LLT()) {
    // Use the largest scalar type whose alignment constraints are satisfied.
    // We only need to check DstAlign here as SrcAlign is always greater or
    // equal to DstAlign (or zero).
    Ty = LLT::scalar(64);
    if (Op.isFixedDstAlign())
      while (Op.getDstAlign() < Ty.getSizeInBytes() &&
             !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
        Ty = LLT::scalar(Ty.getSizeInBytes());
    assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
    // FIXME: check for the largest legal type we can load/store to.
  }

  // Greedily cover the remaining size, shrinking the access type whenever it
  // overshoots what is left.
  unsigned NumMemOps = 0;
  uint64_t Size = Op.size();
  while (Size) {
    unsigned TySize = Ty.getSizeInBytes();
    while (TySize > Size) {
      // For now, only use non-vector load / store's for the left-over pieces.
      LLT NewTy = Ty;
      // FIXME: check for mem op safety and legality of the types. Not all of
      // SDAGisms map cleanly to GISel concepts.
      if (NewTy.isVector())
        NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
      // Drop to the next smaller power-of-two width.
      NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
      unsigned NewTySize = NewTy.getSizeInBytes();
      assert(NewTySize > 0 && "Could not find appropriate type");

      // If the new LLT cannot cover all of the remaining bits, then consider
      // issuing a (or a pair of) unaligned and overlapping load / store.
      unsigned Fast;
      // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
      MVT VT = getMVTForLLT(Ty);
      if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
          TLI.allowsMisalignedMemoryAccesses(
              VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
              MachineMemOperand::MONone, &Fast) &&
          Fast)
        TySize = Size;
      else {
        Ty = NewTy;
        TySize = NewTySize;
      }
    }

    if (++NumMemOps > Limit)
      return false;

    MemOps.push_back(Ty);
    Size -= TySize;
  }

  return true;
}
9690 
// Get a vectorized representation of the memset value operand, GISel edition.
// Produces a register of type \p Ty holding \p Val's low byte replicated
// across every byte of the type.
static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  unsigned NumBits = Ty.getScalarSizeInBits();
  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  // Known-constant scalar: materialize the splatted byte pattern directly.
  if (!Ty.isVector() && ValVRegAndVal) {
    APInt Scalar = ValVRegAndVal->Value.trunc(8);
    APInt SplatVal = APInt::getSplat(NumBits, Scalar);
    return MIB.buildConstant(Ty, SplatVal).getReg(0);
  }

  // Extend the byte value to the larger type, and then multiply by a magic
  // value 0x010101... in order to replicate it across every byte.
  // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
  if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
    return MIB.buildConstant(Ty, 0).getReg(0);
  }

  LLT ExtType = Ty.getScalarType();
  auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
  if (NumBits > 8) {
    // Multiplying the zero-extended byte by 0x0101...01 copies it into every
    // byte position of the scalar.
    APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
    auto MagicMI = MIB.buildConstant(ExtType, Magic);
    Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
  }

  // For vector types create a G_BUILD_VECTOR.
  if (Ty.isVector())
    Val = MIB.buildSplatBuildVector(Ty, Val).getReg(0);

  return Val;
}
9723 
/// Lower a G_MEMSET with known constant length \p KnownLen (must be non-zero)
/// into a sequence of stores of target-chosen types. \p Alignment is the
/// destination alignment, which may be raised for non-fixed frame objects.
/// Returns UnableToLegalize if no acceptable store decomposition exists.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
                             uint64_t KnownLen, Align Alignment,
                             bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memset length!");

  // The destination alignment can only be raised for non-fixed stack objects.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
  std::vector<LLT> MemOps;

  const auto &DstMMO = **MI.memoperands_begin();
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();

  auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
  bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;

  // Ask the target for the store types to use; bail if the budget is blown.
  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
                                     MemOp::Set(KnownLen, DstAlignCanChange,
                                                Alignment,
                                                /*IsZeroMemset=*/IsZeroVal,
                                                /*IsVolatile=*/IsVolatile),
                                     DstPtrInfo.getAddrSpace(), ~0u,
                                     MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);
    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  MachineIRBuilder MIB(MI);
  // Find the largest store and generate the bit pattern for it.
  LLT LargestTy = MemOps[0];
  for (unsigned i = 1; i < MemOps.size(); i++)
    if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
      LargestTy = MemOps[i];

  // The memset stored value is always defined as an s8, so in order to make it
  // work with larger store types we need to repeat the bit pattern across the
  // wider type.
  Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);

  if (!MemSetValue)
    return UnableToLegalize;

  // Generate the stores. For each store type in the list, we generate the
  // matching store of that type to the destination address.
  LLT PtrTy = MRI.getType(Dst);
  unsigned DstOff = 0;
  unsigned Size = KnownLen;
  for (unsigned I = 0; I < MemOps.size(); I++) {
    LLT Ty = MemOps[I];
    unsigned TySize = Ty.getSizeInBytes();
    if (TySize > Size) {
      // Issuing an unaligned load / store pair that overlaps with the previous
      // pair. Adjust the offset accordingly.
      assert(I == MemOps.size() - 1 && I != 0);
      DstOff -= TySize - Size;
    }

    // If this store is smaller than the largest store see whether we can get
    // the smaller value for free with a truncate.
    Register Value = MemSetValue;
    if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
      MVT VT = getMVTForLLT(Ty);
      MVT LargestVT = getMVTForLLT(LargestTy);
      if (!LargestTy.isVector() && !Ty.isVector() &&
          TLI.isTruncateFree(LargestVT, VT))
        Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
      else
        Value = getMemsetValue(Val, Ty, MIB);
      if (!Value)
        return UnableToLegalize;
    }

    // Derive a per-store MMO at the current offset from the original one.
    auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);

    Register Ptr = Dst;
    if (DstOff != 0) {
      auto Offset =
          MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
      Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
    }

    MIB.buildStore(Value, Ptr, *StoreMMO);
    DstOff += Ty.getSizeInBytes();
    Size -= TySize;
  }

  MI.eraseFromParent();
  return Legalized;
}
9836 
/// Lower a G_MEMCPY_INLINE by extracting its operands and memory-operand
/// alignments, then delegating to the limit-free inline lowering overload.
/// The length must be a known constant (dynamic sizes are asserted against).
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);

  auto [Dst, Src, Len] = MI.getFirst3Regs();

  const auto *MMOIt = MI.memoperands_begin();
  const MachineMemOperand *MemOp = *MMOIt;
  bool IsVolatile = MemOp->isVolatile();

  // See if this is a constant length copy
  auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
  // FIXME: support dynamically sized G_MEMCPY_INLINE
  assert(LenVRegAndVal &&
         "inline memcpy with dynamic size is not yet supported");
  uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
  // A zero-length copy is a no-op: just delete the instruction.
  if (KnownLen == 0) {
    MI.eraseFromParent();
    return Legalized;
  }

  // First memoperand is the destination, second is the source.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  Align DstAlign = DstMMO.getBaseAlign();
  Align SrcAlign = SrcMMO.getBaseAlign();

  return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
                           IsVolatile);
}
9866 
9867 LegalizerHelper::LegalizeResult
9868 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
9869                                    uint64_t KnownLen, Align DstAlign,
9870                                    Align SrcAlign, bool IsVolatile) {
9871   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
9872   return lowerMemcpy(MI, Dst, Src, KnownLen,
9873                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
9874                      IsVolatile);
9875 }
9876 
/// Lower a G_MEMCPY (or inline memcpy) with known constant length \p KnownLen
/// (non-zero) into load/store pairs of target-chosen types, as long as at most
/// \p Limit memory operations are needed. Returns UnableToLegalize when the
/// budget is exceeded or no acceptable type decomposition exists.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
                             uint64_t KnownLen, uint64_t Limit, Align DstAlign,
                             Align SrcAlign, bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memcpy length!");

  // The destination alignment can only be raised for non-fixed stack objects.
  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  Align Alignment = std::min(DstAlign, SrcAlign);

  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  // FIXME: infer better src pointer alignment like SelectionDAG does here.
  // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
  // if the memcpy is in a tail call position.

  std::vector<LLT> MemOps;

  // First memoperand is the destination, second is the source.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      IsVolatile),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(NewAlign, *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Now we need to emit a pair of load and stores for each of the types we've
  // collected. I.e. for each type, generate a load from the source pointer of
  // that type width, and then generate a corresponding store to the dest buffer
  // of that value loaded. This can result in a sequence of loads and stores
  // mixed types, depending on what the target specifies as good types to use.
  unsigned CurrOffset = 0;
  unsigned Size = KnownLen;
  for (auto CopyTy : MemOps) {
    // Issuing an unaligned load / store pair  that overlaps with the previous
    // pair. Adjust the offset accordingly.
    if (CopyTy.getSizeInBytes() > Size)
      CurrOffset -= CopyTy.getSizeInBytes() - Size;

    // Construct MMOs for the accesses.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    Register Offset;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
                   .getReg(0);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);

    // Create the store. The Offset constant is reused for the destination.
    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LdVal, StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
    Size -= CopyTy.getSizeInBytes();
  }

  MI.eraseFromParent();
  return Legalized;
}
9983 
// Unroll a G_MEMMOVE of known constant length into individual loads followed
// by individual stores. Because the source and destination ranges may
// overlap, every load is emitted before the first store; apart from that this
// mirrors the memcpy lowering. Returns UnableToLegalize if no acceptable
// sequence of copy types can be found within the target's memmove store
// limit.
// NOTE(review): IsVolatile is not consulted in this body; the caller
// (lowerMemCpyFamily) bails out on volatile operations before dispatching
// here -- confirm if adding new callers.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
                              uint64_t KnownLen, Align DstAlign, Align SrcAlign,
                              bool IsVolatile) {
  auto &MF = *MI.getParent()->getParent();
  const auto &TLI = *MF.getSubtarget().getTargetLowering();
  auto &DL = MF.getDataLayout();
  LLVMContext &C = MF.getFunction().getContext();

  assert(KnownLen != 0 && "Have a zero length memmove length!");

  bool DstAlignCanChange = false;
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool OptSize = shouldLowerMemFuncForSize(MF);
  // The transfer is only as aligned as the more poorly aligned of the two
  // pointers.
  Align Alignment = std::min(DstAlign, SrcAlign);

  // If the destination is a non-fixed stack object, we are allowed to raise
  // its alignment below to match the copy types we pick.
  MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
  if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
    DstAlignCanChange = true;

  unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
  std::vector<LLT> MemOps;

  // G_MEMMOVE carries two memoperands: destination first, source second.
  const auto &DstMMO = **MI.memoperands_begin();
  const auto &SrcMMO = **std::next(MI.memoperands_begin());
  MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
  MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();

  // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
  // to a bug in its findOptimalMemOpLowering implementation. For now do the
  // same thing here.
  if (!findGISelOptimalMemOpLowering(
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
          DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
          MF.getFunction().getAttributes(), TLI))
    return UnableToLegalize;

  if (DstAlignCanChange) {
    // Get an estimate of the type from the LLT.
    Type *IRTy = getTypeForLLT(MemOps[0], C);
    Align NewAlign = DL.getABITypeAlign(IRTy);

    // Don't promote to an alignment that would require dynamic stack
    // realignment.
    const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
    if (!TRI->hasStackRealignment(MF))
      if (MaybeAlign StackAlign = DL.getStackAlignment())
        NewAlign = std::min(NewAlign, *StackAlign);

    if (NewAlign > Alignment) {
      Alignment = NewAlign;
      unsigned FI = FIDef->getOperand(1).getIndex();
      // Give the stack frame object a larger alignment if needed.
      if (MFI.getObjectAlign(FI) < Alignment)
        MFI.setObjectAlignment(FI, Alignment);
    }
  }

  LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");

  MachineIRBuilder MIB(MI);
  // Memmove requires that we perform the loads first before issuing the stores.
  // Apart from that, this loop is pretty much doing the same thing as the
  // memcpy codegen function.
  unsigned CurrOffset = 0;
  SmallVector<Register, 16> LoadVals;
  for (auto CopyTy : MemOps) {
    // Construct MMO for the load, derived from the source MMO at the running
    // byte offset.
    auto *LoadMMO =
        MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());

    // Create the load.
    Register LoadPtr = Src;
    if (CurrOffset != 0) {
      LLT SrcTy = MRI.getType(Src);
      // The offset constant is sized to the pointer's bit width.
      auto Offset =
          MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
      LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
    }
    LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
    CurrOffset += CopyTy.getSizeInBytes();
  }

  // Second pass: store every loaded value at the matching offset from Dst.
  CurrOffset = 0;
  for (unsigned I = 0; I < MemOps.size(); ++I) {
    LLT CopyTy = MemOps[I];
    // Now store the values loaded.
    auto *StoreMMO =
        MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());

    Register StorePtr = Dst;
    if (CurrOffset != 0) {
      LLT DstTy = MRI.getType(Dst);
      auto Offset =
          MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
      StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
    }
    MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
    CurrOffset += CopyTy.getSizeInBytes();
  }
  // The original G_MEMMOVE is fully replaced by the expanded sequence.
  MI.eraseFromParent();
  return Legalized;
}
10089 
10090 LegalizerHelper::LegalizeResult
10091 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
10092   const unsigned Opc = MI.getOpcode();
10093   // This combine is fairly complex so it's not written with a separate
10094   // matcher function.
10095   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
10096           Opc == TargetOpcode::G_MEMSET) &&
10097          "Expected memcpy like instruction");
10098 
10099   auto MMOIt = MI.memoperands_begin();
10100   const MachineMemOperand *MemOp = *MMOIt;
10101 
10102   Align DstAlign = MemOp->getBaseAlign();
10103   Align SrcAlign;
10104   auto [Dst, Src, Len] = MI.getFirst3Regs();
10105 
10106   if (Opc != TargetOpcode::G_MEMSET) {
10107     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
10108     MemOp = *(++MMOIt);
10109     SrcAlign = MemOp->getBaseAlign();
10110   }
10111 
10112   // See if this is a constant length copy
10113   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
10114   if (!LenVRegAndVal)
10115     return UnableToLegalize;
10116   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
10117 
10118   if (KnownLen == 0) {
10119     MI.eraseFromParent();
10120     return Legalized;
10121   }
10122 
10123   bool IsVolatile = MemOp->isVolatile();
10124   // Don't try to optimize volatile.
10125   if (IsVolatile)
10126     return UnableToLegalize;
10127 
10128   if (MaxLen && KnownLen > MaxLen)
10129     return UnableToLegalize;
10130 
10131   if (Opc == TargetOpcode::G_MEMCPY) {
10132     auto &MF = *MI.getParent()->getParent();
10133     const auto &TLI = *MF.getSubtarget().getTargetLowering();
10134     bool OptSize = shouldLowerMemFuncForSize(MF);
10135     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
10136     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
10137                        IsVolatile);
10138   }
10139   if (Opc == TargetOpcode::G_MEMMOVE)
10140     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
10141   if (Opc == TargetOpcode::G_MEMSET)
10142     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
10143   return UnableToLegalize;
10144 }
10145